#    This program is free software; you can redistribute it and/or modify
#    it under the terms of the GNU General Public License as published by
#    the Free Software Foundation; either version 2 of the License, or
#    (at your option) any later version.
#
#    This program is distributed in the hope that it will be useful,
#    but WITHOUT ANY WARRANTY; without even the implied warranty of
#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#    GNU General Public License for more details.
#
#    You should have received a copy of the GNU General Public License
#    along with this program; if not, write to the Free Software
#    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
#
#
#
#    copyright thomasv1 at gmx dot de





__module_name__ = "wikisourcebot"
__module_version__ = "1.0"
__module_description__ = "wikisource interactive regexp bot"

import sys
import socket
import re
import sre_constants
import thread, Queue, time

sys.path.append("../pywikipedia")

import wikipedia, pagegenerators, catlib



def safe_put(page,text):
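    """Save text to page, retrying after put errors; gives up only when
    the page is locked or does not exist."""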
    while True:
        try:
            status, reason, data = page.put(text)
            if data != u'':
                print "put error", status, reason
                time.sleep(10)
                continue
            else:
                break
        except wikipedia.LockedPage:
            print "Page %s is locked?!" % page.aslink().encode("utf8")
            break
        except wikipedia.NoPage:
            print "Page does not exist %s" % page.aslink().encode("utf8")            
            break
        except:
            print "put error:exception"
            time.sleep(5)
            continue
        



def prosify(text):
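    """Normalise the prose div to <div class="text">, wrapping the whole
    text in one if none is present yet."""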
    text = text.replace("<div class=prose>","<div class=\"text\">")
    text = text.replace("<div class='prose'>","<div class=\"text\">")
    text = text.replace("<div class=\"prose\">","<div class=\"text\">")
    
    if "<div class=text>" not in text \
        and "<div class='text'>" not in text \
        and "<div class=\"text\">" not in text:
        print "adding class text"
        return "<div class=\"text\">\n"+text+"\n</div>"
    else:
        return text

def addheader(text,test,header):
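    """Insert header into text unless test already occurs in it,
    preferably just inside the opening <div class="text"> tag."""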
    #header=header.encode("utf8")

    #for i in range(55): print "t", ord(text[i])
    #for i in range(len(header)): print "h", ord(header[i])

    if test not in text:
        print "adding header"
        a = text.find("<div class=\"text\">")

        #alternative: insert the header above the div instead
        #return header+"\n\n"+text

        if a != -1:
            # insert just after the opening tag: 18 == len('<div class="text">')
            return text[:a+18]+"\n"+header+"\n\n"+text[a+18:]
        else:
            return header+"\n\n"+text
    else:
        return text







class NavigationBot:
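    """Adds a {{Navigateur|previous|book|next}} header to every page of a
    book, linking each page to its neighbours and to the book page."""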
    def __init__(self, generator, booktitle):
        self.generator = generator
        self.booktitle = booktitle

    def run(self, dummy_method):
        previouspage = None
        notfound = 0
        changed = 0
        prevname = ""
        prefix_length = len(self.booktitle)

        for page in self.generator:
            if page.namespace() not in [0,2]: continue
            curname=page.title()
            print "site",page.site()
            if curname.startswith(self.booktitle):
                curname="[["+curname+"|"+curname[prefix_length:]+"]]"
            else:
                curname="[["+curname+"]]"

            try:
                try:
                    text = page.get()
                except wikipedia.IsRedirectPage:
                    page = wikipedia.Page(page.site(),page.getRedirectTarget())
                    text = page.get()

            except wikipedia.NoPage:
                print "Page %s does not exist?!" % page.aslink().encode("utf8")
                notfound += 1
                continue
            except wikipedia.LockedPage:
                print "Page %s is locked?!" % page.aslink().encode("utf8")
                notfound += 1
                continue
            except wikipedia.IsRedirectPage:
                print "Page %s is a redirect?!" % page.aslink().encode("utf8")
                notfound += 1
                continue

            if "{{Auteur" in text:
                continue
            if "{{Navigateur" in text:
                continue

            page_ok = page
            # "NONEXT" is a placeholder for the next page's title; it is
            # filled in on the next iteration, or removed after the loop
            text = addheader(text,u"{{Navigateur",
                             u"{{Navigateur|"+prevname+"|[["+self.booktitle+"]]|"+"NONEXT"+"}}")

            if previouspage:
                previoustext = previoustext.replace("NONEXT",curname)
                safe_put(previouspage,previoustext)
                changed += 1

            previouspage = page
            previoustext = text
            prevname = curname

        # the loop leaves a NONEXT placeholder on the last page: clear it
        try:
            text = text.replace("NONEXT","")
            safe_put(page_ok,text)
            changed += 1
        except:
            pass

        return (changed,notfound)



class WsBot:
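    """Applies a text transformation to every page from the generator and
    saves the result whenever it changed."""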
    def __init__(self, generator):
        self.generator = generator

    def run(self,method):

        notfound = 0
        changed = 0

        for page in self.generator:
            print "page is",page
            try:
                if page.namespace() not in [0, 2, 6, 104]: continue
                try:
                    text = page.get()
                except wikipedia.IsRedirectPage:
                    page = wikipedia.Page(page.site(),page.getRedirectTarget())
                    text = page.get()

                #normalise Windows line endings
                text = text.replace("\r\n","\n")

                if "{{Auteur" in text: continue
                text2 = method(text)
                if text2!= text:
                    safe_put(page,text2)
                    changed += 1
                    #print [text]
                else:
                    print "no change for %s" % page.aslink().encode("utf8")
                    

            except wikipedia.NoPage:
                print "Page %s does not exist?!" % page.aslink().encode("utf8")
                notfound += 1
            except wikipedia.IsRedirectPage:
                print "double redirect %s" % page.aslink().encode("utf8")
                notfound += 1

        return (changed,notfound)


def split_page(mysite, rootname, header=""):
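    """Split the page rootname into one subpage per == section == and
    replace the sections on the root page by a list of links."""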
    
    changed = 0
    notfound = 0
    
    page = wikipedia.Page(mysite,rootname)
    text = page.get()
    p = re.compile('==([^=]+)==\n')
    bl= p.split(text)
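    # split with one capture group yields [intro, title1, body1, title2, body2, ...]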
    titles = '\n'
    for i in range(len(bl)/2):

        title  = bl[i*2+1]
        content = bl[i*2+2]

        for illegalChar in ['#', '<', '>', '[', ']', '|', '{', '}', '\n', u'\ufffd']:
            if illegalChar in title:
                title = title.replace(illegalChar,'_')

        if header == "":
            pagetitle = rootname+" - "+title
        elif header == "NOPREFIX":
            pagetitle = title
        else:
            pagetitle = rootname+" - "+header+" "+str(i+1)

        #remove trailing whitespaces
        while content[-1:] in ['\n',' ']:
            content = content[:-1]
        same = False

        pl = wikipedia.Page(mysite,pagetitle)
        while pl.exists():
            if pl.get() == content:
                print "found same content"
                same = True
                break

            m=re.match("(.*) (\d+)",pagetitle)
            if m:
                pagetitle = m.group(1)+" "+str(int(m.group(2))+1)
            else:
                pagetitle+=" 2"
            pl = wikipedia.Page(mysite,pagetitle)


        titles += "*[["+pagetitle+"|"+title+"]]\n"
        if not same:
            safe_put(pl,content)
            changed += 1


    # the text before the first heading stays on the root page, followed
    # by the list of links to the new subpages
    intro = bl[0]
    safe_put(page,intro+titles)
    changed += 1

    return (changed, notfound)




E_BAD_PAR = 1
E_BAD_NS  = 2
E_UNKNOWN = 3
E_IMPORT  = 4
E_SPLIT   = 5
E_RUNTIME = 6

# err_msg holds [runtime error text, pages changed, pages not found]
err_msg = ["","",""]

HOST="irc.freenode.net"
NICK="S89-Bot"
IDENT="S89-Bot"
REALNAME="Bot von Schaengel89 für de.wikisource"
PORT=6667

#read this off the network...
#channels = [ "#wikisource-de" ]
channels = [ "#wikisource-de" ]

#request_pages = [ "User talk:S89-Bot",
request_pages = [ "Benutzer Diskussion:S89-Bot" ]


task_queue = Queue.Queue(0)



def dotask(request_str, text, mysite):
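    """Execute one bot request.

    request_str is the parameter of a {{Requete|...}} template and has the
    form: comment|[[Page]]|command|param1|param2, where literal pipes and
    braces are escaped with a backslash.  Returns 0 on success, or one of
    the E_* codes on failure.
    """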

    parameters = request_str
    try:
        print "raw:", [parameters]
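        # unescape braces and protect escaped pipes with a placeholder,
        # so that split('|') only splits on unescaped pipes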
        parameters = parameters.replace("\\{","{")
        parameters = parameters.replace("\\}","}")
        parameters = parameters.replace("\\|","@@@k")
        parameters = parameters.replace("\\\\","\\")
        sss =  parameters.split('|')
        where = sss[1]
        what = sss[2].lower()
        comment = sss[0]+" : "+what
        try:
            c = sss[3]
        except:
            c = ""
        try:
            d = sss[4]
        except:
            d = ""

        c = c.replace("@@@k","|")
        d = d.replace("@@@k","|")
        print "parameters: ", [c,d]
    except:
        return E_BAD_PAR
        
    if what == 'split':
        wikipedia.setAction(comment)
        try:
            changed, notfound = split_page(mysite, where[2:-2], c)
            err_msg[1] = str(changed)
            err_msg[2] = str(notfound)
            if notfound == 0: err_msg[2] = ""
            return 0
        except:
            return E_SPLIT
    
    elif what =='import':
        wikipedia.setAction(comment + " from " + c)
        try:
            import gallica
            data = gallica.gallica_get(c)
            pl= wikipedia.Page(mysite,where[2:-2])
            pl.put(unicode(data,'latin-1'))
            pl= wikipedia.Page(mysite,"Talk:"+where[2:-2])
            pl.put("source : "+c)
            return 0
        except:
            return E_IMPORT
        

    if where[:8].lower() == "links:[[":
        referredPage = wikipedia.Page(mysite, where[8:-2])
        gen = pagegenerators.LinkedPageGenerator(referredPage)
        preloadingGen = pagegenerators.PreloadingGenerator(gen)
    elif what == 'navigateur':
        referredPage = wikipedia.Page(mysite, where[2:-2])
        gen = pagegenerators.LinkedPageGenerator(referredPage)
        preloadingGen = pagegenerators.PreloadingGenerator(gen)
    else:
        referredPage = wikipedia.Page(mysite, where[2:-2])
        if referredPage.namespace() == 14:
            print "got a category"
            cat = catlib.Category(mysite, referredPage.titleWithoutNamespace())
            gen = pagegenerators.CategorizedPageGenerator(cat)
            preloadingGen = pagegenerators.PreloadingGenerator(gen)
        else:
            preloadingGen = [ referredPage ]

    if what in ['text','texte','prose']:
        bot = WsBot(preloadingGen)
        method = prosify
    elif what in ['delete','effacer']:
        bot = WsBot(preloadingGen)
        method = lambda s: s.replace(c,"")
    elif what in ['replace', 'remplacer', 'remplace']:
        bot = WsBot(preloadingGen)
        method = lambda s: s.replace(c,d)
    elif what in ['regexp']:
        bot = WsBot(preloadingGen)
        def myfun(s):
            p = re.compile(c,re.MULTILINE|re.DOTALL)
            out = p.sub(d,s)
            if len(out) > 2*len(s) and len(out) > 10000:
                raise RuntimeError, "regexp generating too much text"
            return out
        method = myfun

    elif what == 'navigateur':
        if referredPage.namespace() == 0:
            bot = NavigationBot(preloadingGen,where[2:-2])
            method = None
        else:
            return E_BAD_NS
    else:
        return E_UNKNOWN

    wikipedia.setAction(comment)
    try:
        changed, notfound = bot.run(method)
        err_msg[1] = str(changed)
        err_msg[2] = str(notfound)
        if notfound == 0 : err_msg[2] = ""

    except sre_constants.error, ErrorMessage:
        err_msg[0] = str(ErrorMessage)
        print err_msg[0] 
        return E_RUNTIME

    except RuntimeError, ErrorMessage:
        err_msg[0] = str(ErrorMessage)
        print err_msg[0] 
        return E_RUNTIME


    return 0
    

def accept_request(request_str, page):
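    """Mark a request as taken by turning {{Requete|...}} into
    {{RequeteEnCours|...}} on the request page."""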
    wikipedia.setAction("accepting request")
    while True:
        text = page.get()
        text2 = text.replace("{{Requete|"+request_str+"}}","{{RequeteEnCours|"+request_str+"}}")
        if text2==text:
            text2 = text.replace("{{requete|"+request_str+"}}","{{RequeteEnCours|"+request_str+"}}")

        try:
            page.put(text2)
            break
        except wikipedia.EditConflict:
            time.sleep(5)
            continue


def finish_request(request_str, code, request_page, mysite, wait_time, exec_time):
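    """Replace {{RequeteEnCours|...}} with a result template:
    {{RequeteTerminee|...}} on success, {{RequeteInvalide|...}} plus an
    explanation on failure."""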

    while True:
        try:
            page = wikipedia.Page(mysite,request_page)
            text = page.get()
            if code == 0:
                wikipedia.setAction("done")
            else:
                wikipedia.setAction("error")

            if code == 0:
                text = text.replace("{{RequeteEnCours|"+request_str+"}}",
                                    "{{RequeteTerminee|%d|%d|%s|%s|"%(int(wait_time),int(exec_time),err_msg[1],err_msg[2])+request_str+"}}")
            elif code == E_BAD_PAR:
                text = text.replace("{{RequeteEnCours|"+request_str+"}}",
                                    "{{RequeteInvalide|"+request_str+"}} : could not parse parameters")
            elif code == E_BAD_NS:
                text = text.replace("{{RequeteEnCours|"+request_str+"}}",
                                    "{{RequeteInvalide|"+request_str+"}} : bad namespace")
            elif code == E_UNKNOWN:
                text = text.replace("{{RequeteEnCours|"+request_str+"}}",
                                    "{{RequeteInvalide|"+request_str+"}} : request not understood")
            elif code == E_IMPORT:
                text = text.replace("{{RequeteEnCours|"+request_str+"}}",
                                    "{{RequeteInvalide|"+request_str+"}} : could not import this page")
            elif code == E_SPLIT:
                text = text.replace("{{RequeteEnCours|"+request_str+"}}",
                                    "{{RequeteInvalide|"+request_str+"}} : could not split this page")
            elif code == E_RUNTIME:
                text = text.replace("{{RequeteEnCours|"+request_str+"}}",
                                    "{{RequeteInvalide|"+request_str+"}} : runtime error : " + err_msg[0])
            status, reason, data = page.put(text)
            if data != u'':
                print "put error", status, reason
                time.sleep(10)
                continue
            else:
                break

        except wikipedia.EditConflict:
            print "editconflict on "+request_page
            time.sleep(10)
            continue
        





def get_connected():
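    """Connect to the IRC server, identify and join the configured
    channels; returns the socket, or None if the attempt failed."""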
    try:
        s=socket.socket( )
        s.connect((HOST, PORT))
        s.send("NICK %s\r\n" % NICK)
        s.send("USER %s %s bla :%s\r\n" % (IDENT, HOST, REALNAME))
        for channel in channels:
            s.send("JOIN %s\r\n"%channel )
        return s
    except:
        #connection attempt failed (e.g. timed out)
        return None
    



def bot_listening():
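    """Listen on IRC for edits to the request pages and put them on the
    task queue; reconnects whenever the connection drops."""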
    readbuffer=""
    while True:
        s = get_connected()
        while True:
            try:
                data = s.recv(1024)
                if data == '':
                    # recv returns '' when the connection is closed
                    print "connection broken"
                    raise RuntimeError, "connection broken"
                readbuffer += data
                temp = readbuffer.split("\n")
                # keep the last, possibly incomplete line in the buffer
                readbuffer = temp.pop()
            except:
                break

            for line in temp:
                line=line.rstrip()
                full_line = line
                line=line.split()
                if not line:
                    continue

                if line[0]=="PING":
                    try:
                        s.send("PONG %s\r\n" % line[1])
                    except:
                        print "I was disconnected. reconnecting..."
                        break

                elif line[0]=="ERROR":
                    s = get_connected()

                elif len(line) > 1 and line[1]=="PRIVMSG":
                    try:
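                        # group(0) is "[[...]]"; the [5:-5] slice strips the
                        # brackets plus what appear to be mIRC colour codes
                        # inside them; the language code comes from the
                        # channel name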
                        pagename = re.search("\[\[(.*?)\]\]",full_line).group(0)[5:-5]
                        codelang = (line[2].split('.')[0])[1:]
                    except:
                        continue

                    if pagename in request_pages:
                        task_queue.put((codelang, pagename, time.time()))




def main():
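    """Consume the task queue: fetch the request page, accept the first
    {{Requete|...}} found on it, execute it and report the result."""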

    #initial checkup
    #for pagename in request_pages:
    #    task_queue.put((codelang, pagename, time.time()))
    
    thread.start_new_thread(bot_listening,())
    wikipedia.put_throttle.setDelay(wikipedia.config.minthrottle)

    while 1:
        try:
            (codelang, pagename, request_time) = task_queue.get_nowait()
        except:
            time.sleep(0.5)
            continue

        mysite = wikipedia.getSite(codelang,fam='wikisource')
        wikipedia.setSite(mysite)
        page = wikipedia.Page(mysite,pagename)
        try:
            text = page.get()
        except:
            continue

        m=re.search("{{Requete\|(.*?)}}",text,
                    re.MULTILINE | re.IGNORECASE | re.DOTALL)
        if m:
            print "job for me!!!\n"
            #semaphore: tell the other bots that I am taking this request
            accept_time = time.time()
            accept_request(m.group(1), page)
            #others can see that I took it by watching the channel for my edit to the page
            code = dotask(m.group(1), text, mysite)
            finish_request(m.group(1), code, pagename,
                           mysite, accept_time - request_time, time.time() - accept_time)
        else:
            #request is not on the page: it must have been taken by some other bot
            print "no more requests\n"




if __name__ == "__main__":

    try:
        main()
    finally:
        wikipedia.stopme()