# User:ChristieBot/Update historical GAs data.py

# This script tries to keep the historical_GA_reviews table up to date.
# It works as follows:
# 1. Get a set of GA pages -- usually ones created or moved since the last run
# 2. Go through those pages and, for any already in the history table, correct the reviewer and review_ts if necessary
# 3. Go through them again and insert any records not yet in the history table
# 4. Set the "needs_analysis" flag, which normally determines which records get analysed and updated
# 5. Use the where clause to determine which records will actually be analysed
# 6. Loop through the list of records and try various ways to determine what the values in the history table should be
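#
# The history table itself (its name is read from GA_config as 'historical GA reviews table name')
# is assumed to hold at least these columns, judging from the queries further down:
# article_title, page, review_ts, type, comments, outcome, outcome_ts, nominator,
# nomination_ts, reviewer, subtopic, and needs_analysis.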

# Standard library modules
import configparser
import datetime
import operator
import os
import re
import sys
import time

# Third-party modules
import pymysql
import pywikibot
from pywikibot.data.api import PropertyGenerator
from dateutil.parser import parse

# Local modules
sys.path.append('./www/python/src') # Not needed if I run from that directory
from GA import Topic, Subtopic, Nom, Review_stats, WBGAN, Active_nomination, GAN, Name_changes, Nom_list
#import GA_config_test as GA_config
import GA_config
from GA_history import GAH, FailedGA, GAnominee, Article_History, GA_article_page, GARlink, GA_talk_page, GA_sub_page, GA_history_Exception, GAO

# Config
HOME = os.environ.get('HOME') # get environment variable $HOME
replica_path = HOME + '/replica.my.cnf'
if os.path.exists(replica_path):          # check that the credentials file is found
    config = configparser.ConfigParser()
    config.read(replica_path)
else:
    sys.exit('replica.my.cnf file not found') # the database connections below need these credentials, so stop here

site = pywikibot.Site('en','wikipedia')
database = "s55175__ganfilter"
conn = pymysql.connections.Connection(user=config['client']['user'], password=config['client']['password'], database="s55175__ganfilter", host='tools.db.svc.eqiad.wmflabs')
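# replica.my.cnf holds the standard Toolforge database credentials used above, and
# s55175__ganfilter is the bot's own user database on the shared tools database host.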
max_review_ts_str = GAH.get_max_review_ts_str(conn, config)
override_sql = True
# Note: use underscores instead of spaces in page names.
# Timestamps in the replica tables are 14-character YYYYMMDDHHMMSS strings, so date
# literals compared against log_timestamp or rev_timestamp must use that format.
sql = 'SELECT p.page_title FROM page p INNER JOIN logging_logindex l ON p.page_id = l.log_page \
                  WHERE log_type = "move" AND page_namespace = 1 AND page_title LIKE "%/GA_" \
                  AND l.log_timestamp > "' + max_review_ts_str + '" \
                  UNION \
                  SELECT page_title FROM page p INNER JOIN revision r ON p.page_id = r.rev_page \
                  WHERE page_namespace = 1 AND r.rev_parent_id = 0 AND page_title LIKE "%/GA_" \
                  AND rev_timestamp >= "' + max_review_ts_str + '" AND rev_timestamp <= "20230310000000"'
# Debugging overrides -- uncomment one of these to restrict the run to specific pages
#sql = "SELECT p.page_title FROM page p WHERE p.page_title LIKE 'Sovetsky/GA%' AND p.page_namespace = 1"
#sql = 'SELECT page_title FROM page p WHERE page_namespace=1 AND page_title LIKE "Twerton_Park%/GA2"'
# The next call finds any moves that happened in the last 24 hours (usually) whose source page is in the
# historical database, and it does two things: it sets the "needs_analysis" flag in the database, and it
# writes a report to the incomplete-moves page for a human to review.
GAH.find_incomplete_moves(conn, config)

# By default, get_rows_to_check gets all pages that have been moved since max_review_ts, plus all pages created since then.
# To override this, set override_sql = True and pass a query in sql, which will be used instead.
# rows_to_check is the list of GA subpages that need to be reviewed.
rows_to_check = GAH.get_rows_to_check(conn, config, max_review_ts_str, sql, override_sql)

# First check that the reviewer information is correct before inserting new records into the historical database.
# Any records already there that disagree with the creation date and creating editor of the review page will be updated.
GAH.check_reviewer_data(conn, config, rows_to_check)

# The database now holds no incorrect reviewer data for the GA pages in rows_to_check,
# so insert into the historical database any record in rows_to_check that is not already there.
GAH.scan_for_new_pages(conn, config, rows_to_check)

# By default set_needs_analysis_flag will set the flag for all pages that have been moved, created, or edited since max_review_ts.
# Pass in sql as a query string to override this.  It should return a list of article titles.
sql = None
need_analysis_count = GAH.set_needs_analysis_flag(conn, config, max_review_ts_str, sql)
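# Report how many records were flagged; need_analysis_count is not otherwise used below.
print("Flagged " + str(need_analysis_count) + " records as needing analysis")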

# This is the second half of the script, the part that cleans up the history table; it can be run independently.
# If the needs_analysis flag is set on exactly the records you want to update, just run with the default where clause.
conn = pymysql.connections.Connection(user=config['client']['user'], password=config['client']['password'], database=database, host='tools.db.svc.eqiad.wmflabs')
cursor = conn.cursor(pymysql.cursors.DictCursor)
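# DictCursor returns each row as a dict keyed by column name, which is what the
# row['article_title'] / row['page'] lookups below rely on.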
#where_clause = "where article_title collate utf8mb4_bin = 'Helene Scheu-Riesz' and page = 1"
#where_clause = "where type is null"
where_clause = "where needs_analysis = 'Y'"
sql = "select article_title, page, review_ts, type, comments, outcome, outcome_ts, nominator, nomination_ts, reviewer, subtopic from " + GA_config.strings['historical GA reviews table name'] + " " + where_clause
cursor.execute(sql)
#print(sql)
ctr = 0
tdelta = datetime.timedelta(seconds=60) # one-minute window to use when searching for revisions
for row in cursor.fetchall():
    print("Article: " + row['article_title'])
    subp = GA_sub_page(pywikibot.Page(site, "Talk:" + row['article_title'] + "/GA" + str(row['page'])))
    talkp = GA_talk_page(pywikibot.Page(site, "Talk:" + row['article_title']))
    articlep = GA_article_page(pywikibot.Page(site, row['article_title']))
    searchp = talkp
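    # searchp is the page whose revision history the calls below search; here it is the talk page.
    # has_been_set (next line) records which history fields have already been determined, so that
    # sources tried later in the sequence do not overwrite values found by earlier ones
    # (inferred from the way assess_state and the try_* calls share the dict).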
    has_been_set = {'type': False, 'nominator': False, 'nomination_ts': False, 'subtopic': False, 'outcome': False, 'outcome_ts': False, 'comments': False}
    subp.reset_attributes(conn)
    if subp.assess_state(conn, row, has_been_set, talkp, searchp, articlep, subp):
        subp.unset_needs_analysis_flag(conn)
    else:
        #print("Before ah, has_been_set is " + str(has_been_set))
        subp.try_article_history_update(conn, searchp, has_been_set, row)
        #print("Before GAN_page, has_been_set is " + str(has_been_set))
        subp.try_GAN_page_revisions(conn, talkp, has_been_set, row)
        #print("Before fga, has_been_set is " + str(has_been_set))
        subp.try_failed_GA_update(conn, talkp, has_been_set, row)
        #print("Before dga, has_been_set is " + str(has_been_set))
        subp.try_delisted_GA_update(conn, talkp, has_been_set, row)
        #print("Before wbgan, has_been_set is " + str(has_been_set))
        subp.try_WBGAN(conn, talkp, has_been_set, row, config)
        #print("Before GAR_headers, has_been_set is " + str(has_been_set))
        subp.check_for_GAR_headers(conn, row, has_been_set) 
        #print("Before under_review, has_been_set is " + str(has_been_set))
        subp.check_for_under_review(conn, row, has_been_set)
        #print("Before tpr, has_been_set is " + str(has_been_set))
        subp.try_talk_page_revisions(conn, tdelta, searchp, has_been_set, row)
        #print("Before GA, has_been_set is " + str(has_been_set))
        subp.try_GA(conn, searchp, has_been_set, row)
        subp.unset_needs_analysis_flag(conn)
    ctr += 1
    if ctr % 10 == 0:
        print("Processed " + str(ctr) + " articles")