# User:Xenobot/communes.py

#!/usr/bin/env python2.5
#version 0.6

import codecs
import re
import wikitools
import settings

wiki = wikitools.Wiki()
wiki.login(settings.username, settings.password)
print 'logged in now'
output_skip = codecs.open('output-commune.txt', 'a', 'utf-8')

f = open('may3morb.txt', 'r')
page_list = f.read().split('\n')
f.close()
print 'retrieved list of pages'

for name in page_list:
    title = unicode(name, 'utf-8')
    page = wikitools.Page(wiki, '%s' % title, followRedir=False)
    if not page.exists:
        continue

    article_text = page.getWikiText()
    if re.search(r'www.insee.fr', article_text, re.I|re.U):
        if re.search(r'\|[ ]*insee[ ]*=[ ]*(\w\w)(\w\w\w)', article_text, re.I|re.U):
            insee = re.search(r'\|[ ]*insee[ ]*=[ ]*(\w\w)(\w\w\w)', article_text, re.I|re.U)
            if not re.search(r'''
==[ ]*References[ ]*==
.*based on the article.*
.*asso.fr.*
.*insee.fr.*
.*www.ign.fr.*''', article_text, re.I|re.U):
                print 'couldn\'t find ref header; skipping %s' % title
                output_skip.write(title + '  --  no ref header found\n')
                output_skip.flush()
                continue
            new_text = re.compile(r'''
==[ ]*References[ ]*==
.*based on the article.*
.*asso.fr.*
.*insee.fr.*
.*www.ign.fr.*''', re.I|re.U).sub('''
== References ==
* [http://www.maires56.asso.fr Mayors of Morbihan Association] {{fr icon}}
* [http://www.insee.fr/fr/methodes/nomenclatures/cog/fichecommunale.asp?codedep=%s&codecom=%s INSEE commune file]''' % (insee.group(1), insee.group(2)), article_text)
            page.edit(new_text, summary=settings.editsumm, bot=1)
            print 'Editing %s' % title
        else:
            print 'couldn\'t find insee; skipping %s' % title
            output_skip.write(title + '  --  no insee param\n')
            output_skip.flush()
            continue
    else:
        print 'Skipping %s' % title
        output_skip.write(title + '  --  no insee.fr url\n')
        output_skip.flush()

output_skip.close()