# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
#
#
#
# copyright thomasv1 at gmx dot de
__module_name__ = "wikisourcebot"
__module_version__ = "1.0"
__module_description__ = "wikisource interactive regexp bot"
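#
# Overview (summary inferred from the code below): the bot connects to IRC and
# watches its request page on Wikisource. When a {{Requete|...}} template shows
# up there, it marks the request as in progress ({{RequeteEnCours}}), runs the
# requested task (split, import, text, delete, replace, regexp or navigateur)
# over the given pages, and writes the result back as {{RequeteTerminee}} or
# {{RequeteInvalide}}.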
import sys
import socket
import re
import sre_constants
import thread, Queue, time
sys.path.append("../pywikipedia")
import wikipedia, pagegenerators, catlib
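# Save `text` to `page`, retrying on transient put errors; locked or
# non-existent pages are reported and skipped.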
def safe_put(page,text):
while True:
try:
status, reason, data = page.put(text)
if data != u'':
print "put error", status, reason
time.sleep(10)
continue
else:
break
except wikipedia.LockedPage:
print "Page %s is locked?!" % page.aslink().encode("utf8")
break
except wikipedia.NoPage:
print "Page does not exist %s" % page.aslink().encode("utf8")
break
except:
print "put error:exception"
time.sleep(5)
continue
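# Replace the old 'prose' div class by the 'text' class, and wrap the page in
# a <div class="text"> if it has none.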
def prosify(text):
text = text.replace("<div class=prose>","<div class=\"text\">")
text = text.replace("<div class='prose'>","<div class=\"text\">")
text = text.replace("<div class=\"prose\">","<div class=\"text\">")
if "<div class=text>" not in text \
and "<div class='text'>" not in text \
and "<div class=\"text\">" not in text:
print "adding class text"
return "<div class=\"text\">\n"+text+"\n</div>"
else:
return text
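# Insert `header` near the top of the page (just inside the text div when one
# is present) unless `test` already occurs in the text.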
def addheader(text,test,header):
#header=header.encode("utf8")
#for i in range(55): print "t", ord(text[i])
#for i in range(len(header)): print "h", ord(header[i])
if test not in text:
print "adding header"
a = text.find("<div class=\"text\">")
#outside
#return header+"\n\n"+text
if a != -1:
return text[:a+18]+"\n"+header+"\n\n"+text[a+18:]
else:
return header+"\n\n"+text
else:
return text
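# Adds {{Navigateur}} previous/next navigation headers to every page linked
# from a book's table of contents page.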
class NavigationBot:
def __init__(self, generator, booktitle):
self.generator = generator
self.booktitle = booktitle
def run(self, dummy_method):
previouspage = None
notfound = 0
changed = 0
prevname = ""
prefix_length = len(self.booktitle)
for page in self.generator:
if page.namespace() not in [0,2]: continue
curname=page.title()
print "site",page.site()
if curname.startswith(self.booktitle):
curname="[["+curname+"|"+curname[prefix_length:]+"]]"
else:
curname="[["+curname+"]]"
try:
try:
text = page.get()
except wikipedia.IsRedirectPage:
page = wikipedia.Page(page.site(),page.getRedirectTarget())
text = page.get()
except wikipedia.NoPage:
print "Page %s does not exist?!" % page.aslink().encode("utf8")
notfound += 1
continue
except wikipedia.LockedPage:
print "Page %s is locked?!" % page.aslink().encode("utf8")
notfound += 1
continue
except wikipedia.IsRedirectPage:
print "Page %s is a redirect?!" % page.aslink().encode("utf8")
notfound += 1
continue
if "{{Auteur" in text:
continue
if "{{Navigateur" in text:
continue
page_ok = page
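# give this page a Navigateur header pointing to the previous page; the
# NONEXT placeholder is filled in once the next page is known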
text = addheader(text,u"{{Navigateur",
u"{{Navigateur|"+prevname+"|[["+self.booktitle+"]]|"+"NONEXT"+"}}")
if previouspage:
previoustext = previoustext.replace("NONEXT",curname)
safe_put(previouspage,previoustext)
changed += 1
previouspage = page
previoustext = text
prevname = curname
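# finally save the last processed page: it has no next page, so its NONEXT
# placeholder is simply removed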
try:
text = text.replace("NONEXT","")
safe_put(page_ok,text)
changed += 1
except:
pass
return (changed,notfound)
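# Applies a text transformation `method` to every page from the generator and
# saves the pages whose text changed.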
class WsBot:
def __init__(self, generator):
self.generator = generator
def run(self,method):
notfound = 0
changed = 0
for page in self.generator:
print "page is",page
try:
if page.namespace() not in [0, 2, 6, 104]: continue
try:
text = page.get()
except wikipedia.IsRedirectPage:
page = wikipedia.Page(page.site(),page.getRedirectTarget())
text = page.get()
#normalise Windows line endings before applying the method
text = text.replace("\r\n","\n")
if "{{Auteur" in text: continue
text2 = method(text)
if text2!= text:
safe_put(page,text2)
changed += 1
#print [text]
else:
print "no change for %s" % page.aslink().encode("utf8")
except wikipedia.NoPage:
print "Page %s does not exist?!" % page.aslink().encode("utf8")
notfound += 1
except wikipedia.IsRedirectPage:
print "double redirect" % page.aslink().encode("utf8")
notfound += 1
return (changed,notfound)
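# Split `rootname` into one subpage per '== section ==' heading and replace the
# root page with a list of links to the new subpages; colliding titles get a
# number appended.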
def split_page(mysite, rootname, header=""):
changed = 0
notfound = 0
page = wikipedia.Page(mysite,rootname)
text = page.get()
p = re.compile('==([^=]+)==\n')
bl= p.split(text)
titles = '\n'
for i in range(len(bl)/2):
title = bl[i*2+1]
content = bl[i*2+2]
for illegalChar in ['#', '<', '>', '[', ']', '|', '{', '}', '\n', u'\ufffd']:
if illegalChar in title:
title = title.replace(illegalChar,'_')
if header == "":
pagetitle = rootname+" - "+title
elif header == "NOPREFIX":
pagetitle = title
else:
pagetitle = rootname+" - "+header+" "+str(i+1)
#remove trailing whitespace and blank lines from the section content
while content[-1:] in ['\n',' ']:
content = content[:-1]
same = False
pl = wikipedia.Page(mysite,pagetitle)
while pl.exists():
if pl.get() == content:
print "found same content"
same = True
break
m=re.match("(.*) (\d+)",pagetitle)
if m:
pagetitle = m.group(1)+" "+str(int(m.group(2))+1)
else:
pagetitle+=" 2"
pl = wikipedia.Page(mysite,pagetitle)
titles += "*[["+pagetitle+"|"+title+"]]\n"
if not same:
safe_put(pl,content)
changed += 1
header = bl[0]
safe_put(page,header+titles)
changed += 1
return (changed, notfound)
E_BAD_PAR = 1
E_BAD_NS = 2
E_UNKNOWN = 3
E_IMPORT = 4
E_SPLIT = 5
E_RUNTIME = 6
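# err_msg[0]: error description, err_msg[1]: number of changed pages,
# err_msg[2]: number of pages not found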
err_msg = ["","",""]
HOST="irc.freenode.net"
NICK="S89-Bot"
IDENT="S89-Bot"
REALNAME="Bot von Schaengel89 für de.wikisource"
PORT=6667
#read this off the network...
channels = [ "#wikisource-de" ]
#request_pages = [ "User talk:S89-Bot",
request_pages = [ "Benutzer Diskussion:S89-Bot" ]
task_queue = Queue.Queue(0)
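# Parse one '|'-separated request (comment|[[target]]|action|arg1|arg2) and
# execute it; returns 0 on success or one of the E_* error codes.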
def dotask(request_str, text, mysite):
parameters = request_str
try:
print "raw:", [parameters]
parameters = parameters.replace("\\{","{")
parameters = parameters.replace("\\}","}")
parameters = parameters.replace("\\|","@@@k")
parameters = parameters.replace("\\\\","\\")
sss = parameters.split('|')
where = sss[1]
what = sss[2].lower()
comment = sss[0]+" : "+what
try:
c = sss[3]
except:
c = ""
try:
d = sss[4]
except:
d = ""
c = c.replace("@@@k","|")
d = d.replace("@@@k","|")
print "parameters: ", [c,d]
except:
return E_BAD_PAR
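# dispatch on the requested action; the target arrives as [[Title]], hence the
# where[2:-2] slices below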
if what == 'split':
wikipedia.setAction(comment)
try:
changed, notfound = split_page(mysite, where[2:-2], c)
err_msg[1] = str(changed)
err_msg[2] = str(notfound)
if notfound == 0: err_msg[2] = ""
return 0
except:
return E_SPLIT
elif what =='import':
wikipedia.setAction(comment + " from " + c)
try:
import gallica
data = gallica.gallica_get(c)
pl= wikipedia.Page(mysite,where[2:-2])
pl.put(unicode(data,'latin-1'))
pl= wikipedia.Page(mysite,"Talk:"+where[2:-2])
pl.put("source : "+c)
return 0
except:
return E_IMPORT
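# a target of the form links:[[Page]] means: run the task on every page
# linked from Page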
if where[:8].lower() == "links:[[":
referredPage = wikipedia.Page(mysite, where[8:-2])
gen = pagegenerators.LinkedPageGenerator(referredPage)
preloadingGen = pagegenerators.PreloadingGenerator(gen)
elif what == 'navigateur':
referredPage = wikipedia.Page(mysite, where[2:-2])
gen = pagegenerators.LinkedPageGenerator(referredPage)
preloadingGen = pagegenerators.PreloadingGenerator(gen)
else:
referredPage = wikipedia.Page(mysite, where[2:-2])
if referredPage.namespace() == 14:
print "got a category"
cat = catlib.Category(mysite, referredPage.titleWithoutNamespace())
gen = pagegenerators.CategorizedPageGenerator(cat)
preloadingGen = pagegenerators.PreloadingGenerator(gen)
else:
preloadingGen = [ referredPage ]
if what in ['text','texte','prose']:
bot = WsBot(preloadingGen)
method = prosify
elif what in ['delete','effacer']:
bot = WsBot(preloadingGen)
method = lambda s: s.replace(c,"")
elif what in ['replace', 'remplacer', 'remplace']:
bot = WsBot(preloadingGen)
method = lambda s: s.replace(c,d)
elif what in ['regexp']:
bot = WsBot(preloadingGen)
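# apply the requested regexp, refusing substitutions that more than double
# the text and produce over 10000 characters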
def myfun(s):
p = re.compile(c,re.MULTILINE|re.DOTALL)
out = p.sub(d,s)
if len(out) > 2*len(s) and len(out) > 10000:
raise RuntimeError, "regexp generating too much text"
return out
method = myfun
elif what == 'navigateur':
if referredPage.namespace() == 0:
bot = NavigationBot(preloadingGen,where[2:-2])
method = None
else:
return E_BAD_NS
else:
return E_UNKNOWN
wikipedia.setAction(comment)
try:
changed, notfound = bot.run(method)
err_msg[1] = str(changed)
err_msg[2] = str(notfound)
if notfound == 0 : err_msg[2] = ""
except sre_constants.error, ErrorMessage:
err_msg[0] = str(ErrorMessage)
print err_msg[0]
return E_RUNTIME
except RuntimeError, ErrorMessage:
err_msg[0] = str(ErrorMessage)
print err_msg[0]
return E_RUNTIME
return 0
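# Mark the request as taken by turning {{Requete|...}} into
# {{RequeteEnCours|...}} on the request page, retrying on edit conflicts.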
def accept_request(request_str, page):
wikipedia.setAction("accepting request")
while True:
text = page.get()
text2 = text.replace("{{Requete|"+request_str+"}}","{{RequeteEnCours|"+request_str+"}}")
if text2==text:
text2 = text.replace("{{requete|"+request_str+"}}","{{RequeteEnCours|"+request_str+"}}")
try:
page.put(text2)
break
except wikipedia.EditConflict:
time.sleep(5)
continue
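# Record the outcome: replace {{RequeteEnCours|...}} with {{RequeteTerminee|...}}
# (including timing and page counts) or {{RequeteInvalide|...}} plus an error
# message, retrying on edit conflicts.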
def finish_request(request_str, code, request_page, mysite, wait_time, exec_time):
while True:
try:
page = wikipedia.Page(mysite,request_page)
text = page.get()
if code == 0 :
wikipedia.setAction( "done" )
else :
wikipedia.setAction( "error" )
if code == 0:
text = text.replace("{{RequeteEnCours|"+request_str+"}}",
"{{RequeteTerminee|%d|%d|%s|%s|"%(int(wait_time),int(exec_time),err_msg[1],err_msg[2])+request_str+"}}")
elif code == E_BAD_PAR:
text = text.replace("{{RequeteEnCours|"+request_str+"}}",
"{{RequeteInvalide|"+request_str+"}} : could not parse parameters")
elif code == E_BAD_NS:
text = text.replace("{{RequeteEnCours|"+request_str+"}}",
"{{RequeteInvalide|"+request_str+"}} : bad namespace")
elif code == E_UNKNOWN:
text = text.replace("{{RequeteEnCours|"+request_str+"}}",
"{{RequeteInvalide|"+request_str+"}} : request not understood")
elif code == E_IMPORT:
text = text.replace("{{RequeteEnCours|"+request_str+"}}",
"{{RequeteInvalide|"+request_str+"}} : could not import this page")
elif code == E_SPLIT:
text = text.replace("{{RequeteEnCours|"+request_str+"}}",
"{{RequeteInvalide|"+request_str+"}} : could not split this page")
elif code == E_RUNTIME:
text = text.replace("{{RequeteEnCours|"+request_str+"}}",
"{{RequeteInvalide|"+request_str+"}} : runtime error : " + err_msg[0])
status, reason, data = page.put(text)
if data != u'':
print "put error", status, reason
time.sleep(10)
continue
else:
break
except wikipedia.EditConflict:
print "editconflict on "+request_page
time.sleep(10)
continue
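# Open a connection to the IRC server, identify and join the configured
# channels; returns None on failure.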
def get_connected():
try:
s=socket.socket( )
s.connect((HOST, PORT))
s.send("NICK %s\r\n" % NICK)
s.send("USER %s %s bla :%s\r\n" % (IDENT, HOST, REALNAME))
for channel in channels:
s.send("JOIN %s\r\n"%channel )
return s
except:
#connection broken due to expired waiting time
return None
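# IRC thread: read the server stream, answer PINGs, reconnect on errors, and
# queue a task whenever one of the watched request pages is mentioned.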
def bot_listening():
readbuffer=""
while True:
s = get_connected()
while True:
try:
data = s.recv(1024)
# recv() returns an empty string once the server has closed the connection
if data == '':
print "connection broken"
raise RuntimeError, "connection broken"
readbuffer += data
temp = readbuffer.split("\n")
readbuffer = temp.pop()
except:
break
for line in temp:
line=line.rstrip()
full_line = line
line=line.split()
if not line: continue
if line[0]=="PING":
try:
s.send("PONG %s\r\n" % line[1])
except:
print "I was disconnected. reconnecting..."
break
elif line[0]=="ERROR":
s = get_connected()
elif line[1]=="PRIVMSG":
try:
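# the incoming message apparently wraps the page title in IRC colour codes,
# hence the [5:-5] slice; the language code is taken from the channel name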
pagename = re.search("\[\[(.*?)\]\]",full_line).group(0)[5:-5]
codelang = (line[2].split('.')[0])[1:]
except:
continue
if pagename in request_pages:
task_queue.put((codelang, pagename, time.time()))
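# Main loop: pull queued request pages, look for a {{Requete|...}} template,
# accept it, run the task and write back the result.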
def main():
#initial checkup
#for pagename in request_pages:
# task_queue.put((codelang, pagename, time.time()))
thread.start_new_thread(bot_listening,())
wikipedia.put_throttle.setDelay(wikipedia.config.minthrottle)
while 1:
try:
(codelang, pagename, request_time) = task_queue.get_nowait()
except:
time.sleep(0.5)
continue
mysite = wikipedia.getSite(codelang,fam='wikisource')
wikipedia.setSite(mysite)
page = wikipedia.Page(mysite,pagename)
try:
text = page.get()
except:
continue
m=re.search("{{Requete\|(.*?)}}",text,
re.MULTILINE | re.IGNORECASE | re.DOTALL)
if m:
print "job for me!!!\n"
#semaphore: need to tell other bots that I take the request
accept_time = time.time()
accept_request(m.group(1), page)
#one way to know I took it could be to listen to the channel if I did write the page
code = dotask(m.group(1), text, mysite)
finish_request(m.group(1), code, pagename,
mysite, accept_time - request_time, time.time() - accept_time)
else:
#request is not on the page: it must have been taken by some other bot
print "no more requests\n"
if __name__ == "__main__":
try:
main()
finally:
wikipedia.stopme()