User:LemmeyBOT/RefHistoryFix

#!/usr/bin/python
# -*- coding: utf-8  -*-
""""
Bot:LemmeyBOT
FileName:RefHistoryFix.py
Author: Lemmey 3-1-2008
Tagline:the bot that does what editors won't
Purpose:Restores references lost due to vandalism, bad editors, massive changes.
Method:Looks back through article history for the lost reference.
""""
__version__ = '$Id: basic.py 3998 2007-08-07 20:28:27Z wikipedian $'
import wikipedia
import pagegenerators
import re
# This is required for the text that is shown when you run this script
# with the parameter -help.
docuReplacements = {
    '&params;': pagegenerators.parameterHelp
}


class BasicBot:
    msg = {
        'de': u'Bot: Ändere ...',
        'en': u'Restored missing content of named reference using article history',
    }

    def __init__(self, generator, debug):
        self.generator = generator
        self.debug = debug

    def run(self):
        wikipedia.setAction(wikipedia.translate(wikipedia.getSite(), self.msg))
        for page in self.generator:
            self.treat(page)

    def treat(self, page):
        if page.botMayEdit():
            try:
                text = page.get(throttle=False)
                original = text
            except wikipedia.NoPage:
                wikipedia.output(u"Page %s does not exist; skipping." % page.aslink())
                return
            except wikipedia.IsRedirectPage:
                wikipedia.output(u"Page %s is a redirect; skipping." % page.aslink())
                return
            except wikipedia.LockedPage:
                wikipedia.output(u"Page %s is locked; skipping." % page.aslink())
                return
            showtext = text
            try:
                text = fixBroken(text, page)
                showtext = text
            except wikipedia.IsRedirectPage:
                Ignore(page.title())
            if showtext != original:
                wikipedia.output(u"\n\n>>> \03{lightpurple}%s\03{default} <<<" % page.title())
                try:
                    wikipedia.showDiff(original, showtext)
                except:
                    pass

def save_page(page, oldtext, newtext, ref, message):
    print "Message: ", message
    wikipedia.showDiff(oldtext, newtext)
    ## choice = wikipedia.inputChoice(u'Do you want to accept these changes?', ['Yes', 'No'], ['y', 'N'], 'N')
    text = newtext
    choice = 'y'  # HARD CODED: the interactive prompt above is bypassed
    if choice == 'y':
        try:
            page.put(newtext, minorEdit=True, comment=message)
        except wikipedia.EditConflict:
            wikipedia.output(u'Skipping %s because of edit conflict' % page.title())
        except wikipedia.SpamfilterError, error:
            wikipedia.output(u'Cannot change %s because of spam blacklist entry %s' % (page.title(), error))
            newtext = oldtext.replace(ref, u'{{Fact|date=June 2008}}')
            message = "original reference: " + ref + " is a blacklisted source - replaced with a Fact tag"
            text = newtext
            try:
                text = save_page(page, oldtext, newtext, ref, message)
            except:
                pass
    return text
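
# Example of the spam-blacklist fallback in save_page (values are
# hypothetical): if ref is u'<ref name="x">http://blacklisted.example</ref>'
# and the wiki rejects the save, the reference is swapped for
# u'{{Fact|date=June 2008}}' and the save is retried with a summary
# explaining the substitution.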

def get_lost_stubs(text):
    a = Get_Named_Ref_Whole(text)
    b = Get_Named_Ref_Stubs(text)

    # Collect the names used by self-closing refs (<ref name=.../>).
    stub_array = []
    stub_list = []
    for match in b:
        name = match.group()
        s = re.compile('= *[^/]+? */', re.I | re.S)
        search = s.search(name)
        if search != None:
            tag = search.group()
            tag = tag[1:-1]
            tag = tag.strip()
            if name not in stub_list:
                stub_array.append(tag)
                stub_list.append(name)

    # Collect the names that have a full definition (<ref name=...>...</ref>).
    whole_array = []
    for match in a:
        name = match.group()
        s = re.compile('= *[^/]+? ?>', re.I | re.S)
        search = s.search(name)
        if search != None:
            tag = search.group()
            tag = tag[1:-1]
            tag = tag.strip()
            whole_array.append(tag)

    # A stub whose name never gets a full definition is "lost".
    lost_stubs = []
    for x in range(len(stub_array)):
        stub = stub_array[x]
        if stub not in whole_array and (stub, stub_list[x]) not in lost_stubs:
            lost_stubs.append((stub, stub_list[x]))
    return lost_stubs
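
# Worked example for get_lost_stubs (hypothetical wikitext): given
#   text = u'A claim.<ref name="larry"/> No full definition anywhere.'
# Get_Named_Ref_Stubs matches the self-closing <ref name="larry"/> but
# Get_Named_Ref_Whole finds no <ref name="larry">...</ref>, so the call
# returns [(u'"larry"', u'<ref name="larry"/>')] (the name keeps its quotes).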

def fixBroken(text, page):
    b = get_lost_stubs(text)
    for item in b:
        stub = item[0]
        ref = item[1]
        a = None
        FOUND = False
        title = page.title()
        # Walk back through the article's history looking for a revision
        # that still contains the full definition of this named ref.
        vh = page.getVersionHistory(getAll=True)
        for entry in vh:
            text2 = page.getEditPage(oldid=entry[0])[0]
            a = Get_Specific_Named_Whole(text2, stub)
            if a != None and Check_for_Blank(stub, a.group()) == False:
                FOUND = True
                try:
                    newtext = text.replace(ref, a.group(), 1)
                    text = save_page(page, text, newtext, ref, u'Restored missing content of named reference ' + ref + ' using version ' + str(entry[0]))
                except:
                    pass
            if FOUND: break

        if not FOUND:
            print "Hit Bottom: ", stub
            Show(title, ref)
            Ignore(title)
    return text

def Check_for_Blank(name, tag):
    # True if the named ref exists but is empty, e.g. <ref name="larry"></ref>
    pattern = '< ?ref ?name *= *' + name + ' ?> *< ?/ ?ref ?>'
    a = re.compile(pattern, re.I | re.S)
    search = a.search(tag)
    return search != None
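
# For example (hypothetical calls):
#   Check_for_Blank(u'"larry"', u'<ref name="larry"></ref>')            # -> True
#   Check_for_Blank(u'"larry"', u'<ref name="larry">Smith 2007</ref>')  # -> False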

def Get_Specific_Named_Whole(text2, name):
    # Full definition of one named ref, e.g. <ref name="larry">...</ref>
    pattern = '< ?ref ?name *= *' + name + ' ?>.+?< ?/ ?ref ?>'
    a = re.compile(pattern, re.I | re.S)
    search = a.search(text2)
    return search

def Get_Named_Ref_Stubs(text):
    # Self-closing named refs, e.g. <ref name="larry"/>
    ## old pattern: '< *ref *name *= *[\w "-]+? */ *>'
    pattern = '< *ref *name *= *[^>]+? */ *>'
    b = re.compile(pattern, re.I | re.S)
    iterator = b.finditer(text)
    return iterator

def Get_Named_Ref_Whole(text):
    # Named refs with their full content, e.g. <ref name="larry">...</ref>
    pattern = '< ?ref ?name *= *[^/]+? ?>.+?< ?/ ?ref ?>'
    a = re.compile(pattern, re.I | re.S)
    iterator = a.finditer(text)
    return iterator
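
# The two patterns above cover the two shapes of a named ref (examples
# hypothetical): Get_Named_Ref_Stubs matches the self-closing form
# <ref name="larry"/>, while Get_Named_Ref_Whole matches the full form
# <ref name="larry">http://example.com</ref>, whose body is the content
# this bot tries to recover from the article history.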

def Ignore(article):
    # Append the title to the skip list so the bot does not revisit it.
    f = open('list.txt', 'a')
    f.write(article + '\n')
    f.close()

def Show(article, ref):
    # Log an unrecoverable reference for human review.
    f = open('bad.txt', 'a')
    f.write('# [[' + article + ']]' + '  <nowiki>' + ref + '</nowiki>\n')
    f.close()
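
# The two helpers above append to plain-text work files: list.txt collects
# titles for the bot to skip, and bad.txt builds a wiki-formatted review
# list, one line per failure, e.g. (hypothetical):
#   # [[Some article]]  <nowiki><ref name="x"/></nowiki>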

def main():
    genFactory = pagegenerators.GeneratorFactory()
    gen = None
    pageTitleParts = []
    debug = False

    for arg in wikipedia.handleArgs():
        if arg.startswith("-debug"):
            debug = True
        else:
            generator = genFactory.handleArg(arg)
            if generator:
                gen = generator
            else:
                pageTitleParts.append(arg)

    if pageTitleParts:
        pageTitle = ' '.join(pageTitleParts)
        page = wikipedia.Page(wikipedia.getSite(), pageTitle)
        gen = iter([page])

    if gen:
        gen = pagegenerators.PreloadingGenerator(gen)
        bot = BasicBot(gen, debug)
        bot.run()
    else:
        wikipedia.showHelp()

if __name__ == "__main__":
    try:
        main()
    finally:
        wikipedia.stopme()