User:Polbot/source/Polbot.pm

package Polbot;

use strict;
use LWP::UserAgent;

# Here is an example for this sub's usage:
#
# my $url1 = 'http://bioguide.congress.gov/scripts/biodisplay.pl?index=H000671'; 
# print Polbot::bio2wiki($url1);

sub bio2wiki {
	my $url = shift;

	# Constants
	my $pronoun = 'He'; #Unfortunately, there is no way to tell if the person is male or female from the bioguide. I hate assuming male here, but what can you do?
	my $preps = 'in|near|to|at|of';
	my $months = 'January|February|March|April|May|June|July|August|September|October|November|December';
	my $states = 'Alaska|Alabama|Arkansas|Arizona|California|Colorado|Connecticut|Deleware|Florida|Georgia|Hawaii|Idaho|Illinois|Indiana|Iowa|Kansas|Kentucky|Louisiana|Maine|Maryland|Massachusetts|Michigan|Minnesota|Mississippi|Montana|Missouri|Nebraska|Nevada|New Hampshire|New Jersey|New Mexico|New York|North Carolina|North Dakota|Ohio|Oklahoma|Oregon|Pennsylvania|Rhode Island|South Carolina|South Dakota|Tennessee|Texas|Utah|Vermont|Virginia|Washington|West Virginia|Wisconsin|Wyoming|Ireland|France|England|Scotland|Wales|Holland|Spain|Germany';
	my $He_list = 'attended|became|commenced|completed|continued|declined|did|died|engaged|entered|established|graduated|is|journeyed|left|lived|lives|moved|owned|owns|participated|pursued|received|remained|remains|represented|represents|resigned|resumed|retired|returned|served|settled|signed|studied|successfully|taught|unsuccessfully|was|went|worked|works';
	my $Hewas_list = 'a|an|admitted|affiliated|appointed|assigned|author|discharged|editor|educated|employed|engaged|entombed|impeached|interred|interested|not|one|owner|promoted|publisher|reelected|re-elected|reinterred';
	my $Servedas_list = 'Court|Democratic|Republican|adjutant|aide|assistant|associate|businessman|businesswoman|captain|chair|chairman|clerk|collector|colonel|commissioner|defense|delegate|director|district|general|governor|inspector|judge|justice|lieutenant|magistrate|master|mayor|[mM]ember|naval|overseer|president|presidential|proprietor|prosecuting|solicitor|special|staff|vice|war';
	
	# Connect to the URL
	my $ua = new LWP::UserAgent;
	$ua->agent("Mozilla/6.0");  
	my $req = new HTTP::Request GET => $url;
	my $res = $ua->request($req);
	$res->is_success or die "Could not get content";
	
	# Get the content 
	my $content = $res->content;
	$content =~ s/^.*<P><FONT SIZE=4 COLOR=\"\#800040\">([^<]*), ?<\/FONT>([^<]*)<\/(TD|P)>.*$/$2/s;  # Just the main text (minus name)
	my $reversedname = $1;
	$content =~ s/\n//sg;  # as a single line
	
	# Parse name
	$reversedname =~ s/\s+/ /g;
	$reversedname =~ m/^([^,]*), ([^,]*)(, .*)?$/;
	my $firstname = $2;
	my $lastname = $1;
	my $suffix = $3;
	#die ">$foundname<  =>  >$foundfirstname< >$foundlastname< >$foundsuffix<\n";
	$lastname =~ s/(\w+)/\u\L$1/g;
	$reversedname = "$lastname, $firstname$suffix";
	my $fullname = "$firstname $lastname$suffix";
	
	# Do universal search & replaces
	$content =~ s/\s+/ /g; #take out dbl spaces;
	$content = unabbreviate_states($content); #expand all state names
	$content = link_cities_from_pattern($content); 
	$content = link_dates_from_pattern($content);
	$content = link_colleges_from_pattern($content);
	$content = replace_recognized_tokens($content);
	
	# split into individual lines
	my @lines = split(/; /, $content);  
	foreach my $line (@lines) { $line =~ s/^ // }  #take out leading space (if there)
	
	# Set up initial variables
	my $familyinfo = '';
	my $iswas = 'is';
	my $initial_description = '';
	my $birthdeath = 'unknown birth and death';
	my $birth = '';
	my $birthyear = '';
	my $death = '';
	my $deathyear = '';
	my $body = '';
	my %cats = (); # for categories like "Senator from Kentucky"
	
	# line 1. First off, does it start with " (son of . . .), " or something similar?
	# e.g. brother of John Fitzgerald Kennedy and Robert Francis Kennedy, grandson of John Francis Fitzgerald
	my $line = shift(@lines);
	
	if ($line =~ m/^\(([^)]*)\)/) {
		$familyinfo = $1;
		$line =~ s/^\([^)]*\), (.*)$/$1/;	
		
		$familyinfo =~ s/of ([^,]*),/of [[$1]],/g;
		$familyinfo =~ s/of ([^,]*)$/of [[$1]]/g;
		$familyinfo =~ s/([^],]) and /$1]] and [[/g;	
	}
	
	# Now, make line1 into the initial description, and pick categories.
	$initial_description = $line;
	
	while ($initial_description =~ m/(a Senator and a Representative|a Representative and a Senator) from ($states)/g) {
		#senator and rep
		$cats{"[[Category:United States Senators from $2]]"} = $2;
		$cats{"[[Category:Members of the United States House of Representatives from $2]]"} = $2;
	}
	
	while ($initial_description =~ m/Senator from ($states)/g) {
		$cats{"[[Category:United States Senators from $1]]"} = $1;
	}
	
	while ($initial_description =~ m/Representative from ($states)/g) {
		$cats{"[[Category:Members of the United States House of Representatives from $1]]"} = $1;
	}
	
	$initial_description =~ s/(Territory of )?($states)/[[$1$2]]/g;
	$initial_description =~ s/Senator/[[United States Senate|U.S. Senator]]/g;
	$initial_description =~ s/Representative/[[United States House of Representatives|U.S. Representative]]/g;
	
	# Next line: look for birth place and date.
	my $line = shift(@lines);

	if ($line =~ m/(born|Born)/) {
		if ($line =~ m/^(.*), in (\d+)$/) {
			$birthyear = $2;
			$birth = $2;
			$line = $1;
		} elsif ($line =~ m/^(.*), about (\d+)$/) {
			$birth = "ca. $2";
			$birthyear = $2;
			$line = $1
		} elsif ($line =~ m/^(.*?)(?:,)? (?:on )?(\[\[\w* \d+\]\], \[\[(\d+)\]\])$/) {
			$birth = $2;
			$birthyear = $3;
			$line = $1;
		} elsif ($line =~ m/^(.*), birth date (unknown)/) {
			$birth = $2;
			$line = $1;
		} else {
			$birth = 'unknown';
		}
		
		if ($line =~ s/^(was |probably )?born/Born/) {
			$body .= "$line, $lastname";
		} elsif ($line eq 'birth date unknown') {
			$body = $lastname;
		} else {
			die "I didn't expect: $line";
		}
	} else {
		$birth = 'unknown';
		$body = prepend_line($lastname, $lastname, $line);
	}

	# Next line. . .
	my $line = shift(@lines);
	$line = prepend_line('', $lastname, $line);
	$body .= $line;
	
	# Subsequent lines. . .
	while ($line = shift(@lines)) {
		if ($line eq 'birth date unknown') {
			$birth = 'unknown';
			$birthyear = '';
			next;
		} 
		
		if ($line =~ m/^[dD]eath date unknown\.? ?$/) {
			$death = 'unknown';
			$deathyear = '';
			$iswas = 'was';
			next;
		} 
			
		$line = prepend_line($pronoun, $lastname, $line);
		
		# look for death
		if ($line =~ m/(died|death(?! of)).*(\d\d\d\d)/) {
			$deathyear = $2;
			$death = $deathyear;
			$iswas = 'was';
			
			#TODO - change this to ignore "death of", check against http://bioguide.congress.gov/scripts/biodisplay.pl?index=A000022
			if ($line =~ m/(died|death(?! of)).*(\[\[($months) \d+\]\], \[\[\d\d\d\d\]\])/) {
				$death = $2;
			}
		}
	
		$body .= $line;
	}
	
	# Finalize Initial description.
	if ($birth) {
		if ($death) {
			$birthdeath = "$birth - $death";
			if ($birthdeath eq 'unknown - unknown') { $birthdeath = 'birth and death dates unknown'; }
		} else {
			if ($birth eq 'unknown') {
				$birthdeath = 'unknown date of birth';
			} else {
				$birthdeath = "born $birth";
			}
		}
	}
	
	my $boilerplate = "<!" . "-- This article was automatically created by [[User:polbot]] from $url. The prose may be stilted, and there may be grammatical and Wikification errors. Please improve in any way you see fit. --" . ">";
	$initial_description = "$boilerplate'''" . $fullname . "''' ($birthdeath) $iswas " . $initial_description;
	if ($familyinfo) {
		$initial_description .= ", " . $familyinfo;
	}
		
	# Add ending stuff
	$url =~ m/^.*=(.*)$/;
	my $ending_stuff = "==Source==\n{{CongBio|$1}}\n\n{{DEFAULTSORT:$reversedname}}\n";
	
	if ($birthyear) {
		$cats{"[[Category:$birthyear births]]"} = 'a'
		#$ending_stuff .= "[[Category:$birthyear births]]\n";
	} else {
		$cats{"[[Category:Year of birth unknown]]"} = 'a'
		#$ending_stuff .= "\n";
	}
	if ($iswas eq 'is') {
		$cats{"[[Category:Living people]]"} = 'a'
		#$ending_stuff .= "\n";
	} elsif ($death =~ m/\d\d\d\d/) {
		$cats{"[[Category:$deathyear deaths]]"} = 'a'
		#$ending_stuff .= "\n";
	} else {
		$cats{"[[Category:Year of death unknown]]"} = 'a'
		#$ending_stuff .= "\n";
	}
	$ending_stuff .= join("\n", sort keys %cats);

	
	
	# Done!
	$body = "$initial_description.\n\n$body\n$ending_stuff";
	return $body;
		
	# ===================================================================================================
	# ====================   Inner subs   ===============================================================
	# ===================================================================================================
	
	sub prepend_line
	{
		my $starter = shift;
		my $lastname = shift;
		my $line = shift;
		
		my $analyzeline = $line;
		
		# If the line starts with these, skip them.
		$analyzeline =~ s/^after the war//;
		$analyzeline =~ s/^again//;
		$analyzeline =~ s/^also//;
		$analyzeline =~ s/^originally//;
		$analyzeline =~ s/^several times//;
		$analyzeline =~ s/^soon afterward//;
		$analyzeline =~ s/^subsequently//;
		
		#Get 
		my ($initchar) = ($analyzeline =~ m/(.)/);
		my ($initword) = ($analyzeline =~ m/(\w+)/);
	
		if ($initchar eq '[') {
			$line = "$starter was in the $line.\n";
		} elsif ($initword =~ /^(successful|lawyer|teacher)$/) {
			$line = "$starter was a $line.\n";
		} elsif ($initword eq 'unsuccessful') {
			$line = "$starter was an $line.\n";
		} elsif ($initword eq 'elected') {
			$line = "\n$lastname was $line.\n";
		} elsif ($initword =~ m/^($He_list)$/) {
			$line = "$starter $line.\n";
		} elsif ($initword =~ m/^($Hewas_list)$/) {
			$line = "$starter was $line.\n";
		} elsif ($initword =~ m/^($Servedas_list)$/) {
			$line = "$starter served as $line.\n";
		} elsif ($initword =~ /^(re)?interment$/) {
			$line =~ s/^(re)?interment/$starter was $1interred/;
			$line = "$line.\n";
			$iswas = 'was';
		} else {
			$line =~ s/^([a-z])/\U$1/;
			$line = "<!" . "-- A grammar fix may be needed here. --" . ">$line.\n";
		}
		
		# clean up
		$line =~ s/(\.? \.|\. )$/./;
		return $line;
	}
	
	sub replace_recognized_tokens
	{
		my $content = shift;
		
		# links

		$content =~ s/Amherst College/[[Amherst College]]/g;
		$content =~ s/Civil War/[[American Civil War|Civil War]]/g;
		$content =~ s/Confederate Army/[[Confederate States Army]]/g;
		$content =~ s/Confederate States of America/[[Confederate States of America]]/g;
		$content =~ s/Constitution of the United States/[[United States Constitution|Constitution of the United States]]/g;
		$content =~ s/Democratic National Committee/[[Democratic National Committee]]/g;
		$content =~ s/Democratic Party/[[Democratic Party (United States)|Democratic Party]]/g;
		$content =~ s/Democratic-Republican Party/[[Democratic-Republican Party (United States)|Democratic-Republican Party]]/g;
		$content =~ s/Democratic Republican Party/[[Democratic-Republican Party (United States)|Democratic Republican Party]]/g;
		$content =~ s/Department of Defense/[[United States Department of Defense|Department of Defense]]/g;
		$content =~ s/Department of War/[[United States Department of War|Department of War]]/g;
		$content =~ s/Eton College/[[Eton College]]/g;
		$content =~ s/Federalist Party/[[Federalist Party (United States)|Federalist Party]]/g;		
		$content =~ s/Free-Soil Party/[[Free Soil Party|Free-Soil Party]]/g;
		$content =~ s/Harvard College/[[Harvard College]]/g;
		$content =~ s/justice of the peace/[[Justice of the Peace]]/g;
		$content =~ s/Opposition Party/[[Opposition Party (United States)|Opposition Party]]/g;
		$content =~ s/Republican National Committee/[[Republican National Committee]]/g;
		$content =~ s/Revolutionary War/[[American Revolutionary War|Revolutionary War]]/g;
		$content =~ s/Union Army/[[Union Army]]/g;
		$content =~ s/Union College/[[Union College]]/g;
		$content =~ s/United States Air Force/[[United States Air Force]]/g;
		$content =~ s/United States Army Medical Corps/[[Army Medical Department (United States)|United States Army Medical Corps]]/g;
		$content =~ s/United States Army Reserve/[[United States Army Reserve]]/g;
		$content =~ s/United States House of Representatives/[[United States House of Representatives]]/g;
		$content =~ s/United States Marine Corps/[[United States Marine Corps]]/g;
		$content =~ s/United States Marines/[[United States Marine Corps]]/g;
		$content =~ s/United States Navy/[[United States Navy]]/g;
		$content =~ s/United States Representative/[[United States Representative]]/g;
		$content =~ s/United States Senate/[[United States Senate]]/g;
		$content =~ s/United States Senator/[[United States Senator]]/g;
		$content =~ s/United States Supreme Court/[[Supreme Court of the United States|United States Supreme Court]]/g;
		$content =~ s/United States Treasury Department/[[United States Treasury Department]]/g;
		$content =~ s/(Vice )?President of the United States/[[$1President of the United States]]/g;
		$content =~ s/Washington, D.C./[[Washington, D.C.]]/g;
		$content =~ s/William and Mary College/[[William and Mary College]]/g;
		$content =~ s/Yale College/[[Yale College]]/g;	

		$content =~ s/Republican Party/[[Republican Party (United States)|Republican Party]]/g;
		$content =~ s/United States Army/[[United States Army]]/g;
		$content =~ s/as a Democrat/as a [[Democratic Party (United States)|Democrat]]/g;
		$content =~ s/as a Federalist/as a [[Federalist Party (United States)|Federalist]]/g;
		$content =~ s/as a Republican/as a [[Republican Party (United States)|Republican]]/g;
		$content =~ s/as a Whig/as a [[Whig Party (United States)|Whig]]/g;
		
		$content =~ s/($states) (state )?senate/[[$1 Senate]]/g;
		$content =~ s/($states) (state )?house of representatives/[[$1 House of Representatives]]/g;

		# grammar-related replacements
		$content =~ s/graduated, /graduated from /g;
		$content =~ s/lawyer, private/lawyer in private/g;
		$content =~ s/, (\d\d\d\d) ?- ?(\d\d\d\d)/ from $1 to $2/g;
		$content =~ s/\(([^)]*)\;/($1, and/g;
		$content =~ s/(member|chairman|chair), /$1 of the /g;
		$content =~ s/\&\#146\;/'/g;
		$content =~ s/\&\#14[78]\;/"/g;
		
		return $content;
	}
		
	
	sub link_colleges_from_pattern
	{
		my $content = shift;
		
		# "Something University"
		$content =~ s/(([A-Z][a-z]+ (and )?)*[A-Z][a-z]+ (University|Academy))/\[\[$1\]\]/g;
		
		# "University of Something"
		$content =~ s/(University of [A-Z][a-z]+( (at )?[A-Z][a-z]+)*)/\[\[$1\]\]/g;
		
		return $content;
	}
	
	sub link_dates_from_pattern
	{
		my $content = shift;
		
		$content =~ s/($months) (\d+), *(\d\d\d\d)/[[$1 $2]], [[$3]]/g;
		
		return $content;
	}
	
	sub link_cities_from_pattern
	{
		my $content = shift;
		
		#prep City, State (or prep County, State)
		$content =~ s/ ($preps) ([A-Z][a-z]*( [A-Z][a-z]*)*, ($states))/ $1 [[$2]]/g;
		
		#prep City, Something County, State
		$content =~ s/ ($preps) ([A-Z][a-z]*( [A-Z][a-z]*)*),( [A-Z][a-z]*)* County, (($states))/ $1 [[$2, $5]]/g;
		
		#, City, Something County, State
		$content =~ s/, ([A-Z][a-z]*( [A-Z][a-z]*)*),( [A-Z][a-z]*)* County, (($states))/, [[$1, $4]]/g;
		
		#, Something, State
		$content =~ s/, ([A-Z][a-z]*( [A-Z][a-z]*)*, ($states))/, [[$1]]/g;
		
		return $content;
	}
	
	sub unabbreviate_states 
	{
		my $content = shift;
		
		$content =~ s/Ala\./Alabama/g;
		$content =~ s/Ariz\./Arizona/g;
		$content =~ s/Ark\./Arkansas/g;
		$content =~ s/Calif\./California/g;
		$content =~ s/Colo\./Colorado/g;
		$content =~ s/Conn\./Connecticut/g;
		$content =~ s/Del\./Delaware/g;
		$content =~ s/Fla\./Florida/g;
		$content =~ s/Ga\./Georgia/g;
		$content =~ s/Ill\./Illinois/g;
		$content =~ s/Ind\./Indiana/g;
		$content =~ s/Kans\./Kansas/g;
		$content =~ s/Ky\./Kentucky/g;
		$content =~ s/La\./Louisiana/g;
		$content =~ s/Md\./Maryland/g;
		$content =~ s/Mass\./Massachusetts/g;
		$content =~ s/Mich\./Michigan/g;
		$content =~ s/Minn\./Minnesota/g;
		$content =~ s/Miss\./Mississippi/g;
		$content =~ s/Mo\./Missouri/g;
		$content =~ s/Mont\./Montana/g;
		$content =~ s/Nebr\./Nebraska/g;
		$content =~ s/Nev\./Nevada/g;
		$content =~ s/N\.H\./New Hampshire/g;
		$content =~ s/N\.J\./New Jersey/g;
		$content =~ s/N\.M\./New Mexico/g;
		$content =~ s/N\.Y\./New York/g;
		$content =~ s/N\.C\./North Carolina/g;
		$content =~ s/N\.D\./North Dakota/g;
		$content =~ s/Okla\./Oklahoma/g;
		$content =~ s/Ore\./Oregon/g;
		$content =~ s/Pa\./Pennsylvania/g;
		$content =~ s/R\.I\./Rhode Island/g;
		$content =~ s/S\.C\./South Carolina/g;
		$content =~ s/S\.D\./South Dakota/g;
		$content =~ s/Tenn\./Tennessee/g;
		$content =~ s/Tex\./Texas/g;
		$content =~ s/Vt\./Vermont/g;
		$content =~ s/Va\./Virginia/g;
		$content =~ s/Wash\./Washington/g;
		$content =~ s/W\.Va\./West Virginia/g;
		$content =~ s/Wis\./Wisconsin/g;
		$content =~ s/Wyo\./Wyoming/g;
		
		return $content;
	}
}

# Here is an example for this sub's usage:
# $URL = Polbot::Get_URL_from_name("Mitch McConnell");

sub Get_URL_from_name
{
	my $article_name = shift;
	
	my @URLs = ();
	my $ErrMsg;
	my $fname;
	my $lname;

	$article_name =~ s/ \(.*\)//g;  # Take out anything parenthesized.
	
	if ($article_name =~ m/^(.*) ([^ ]*)(, Jr.|, Sr.| II| III)$/) {
		$fname = $1 . $3;
		$lname = $2;
	} elsif ($article_name =~ m/^(.*) ([^ ]*)$/) {
		$fname = $1;
		$lname = $2;
	} else {
		return "Malformed article name '$article_name'";
	}
	
	@URLs = Get_matching_URLs($fname, $lname);
	my $nummatches = scalar(@URLs);
	
	if ($nummatches eq 1) {
		return  $URLs[0];
	} elsif ($nummatches > 1) {
		return "Multiple hits for '$lname, $fname'.";
	}

	$ErrMsg = "No hits for '$lname, $fname'.";

	# Take off the suffix
	if ($fname =~ s/(, Jr\.|, Sr\.| II| III)$//) {
		@URLs = Get_matching_URLs($fname, $lname);
		my $nummatches = scalar(@URLs);
		
		if ($nummatches eq 1) {
			return  $URLs[0];
		} elsif ($nummatches > 1) {
			$ErrMsg .= " Multiple hits for '$lname, $fname'.";
			return $ErrMsg;
		}
		$ErrMsg .= " No hits for '$lname, $fname'.";
	}
	
	# Try like "C. Everett Coop"
	if ($fname =~ s/^.\. //) {
		@URLs = Get_matching_URLs($fname, $lname);
		my $nummatches = scalar(@URLs);
		
		if ($nummatches eq 1) {
			return  $URLs[0];
		} elsif ($nummatches > 1) {
			$ErrMsg .= " Multiple hits for '$lname, $fname'.";
			return $ErrMsg;
		}
		$ErrMsg .= " No hits for '$lname, $fname'.";
	}
	
	# Try like "John Q. Adams"
	if ($fname =~ s/\..*$//) {
		@URLs = Get_matching_URLs($fname, $lname);
		my $nummatches = scalar(@URLs);
		
		if ($nummatches eq 1) {
			return  $URLs[0];
		} elsif ($nummatches > 1) {
			$ErrMsg .= " Multiple hits for '$lname, $fname'.";
			return $ErrMsg;
		}
		$ErrMsg .= " No hits for '$lname, $fname'.";
	}
	
	return $ErrMsg;
}
	
sub Get_matching_URLs
{
	my $firstname = shift;
	my $lastname = shift;
	
	my $url = 'http://bioguide.congress.gov/biosearch/biosearch1.asp';
	
	my $ua = LWP::UserAgent->new;
	$ua->agent("Mozilla/6.0");
	my @links = ();
		
	my $res = $ua->post($url, ['lastname' => $lastname, 'firstname' => $firstname]);
	if ($res->is_success) {
		my $content = $res->content;
		@links = ($content =~ m/<td><A HREF=\"([^"]*)\">/g);
	} else {		
		print "could not connect, lastname = $lastname, firstname=$firstname"
	}

	return @links;
}

sub fix_dates {
	my $txt = shift;
	
    # century without AD,BC etc
    $txt =~ s/\[\[(\d{1,2}(?:st|nd|rd|th))[ \-]century\]\]/$1 century/gi;
    $txt =~ s/\[\[\d{1,2}(?:st|nd|rd|th)[ \-]century\|(\d{1,2}(?:st|nd|rd|th))\]\]/$1/gi;
    $txt =~ s/\[\[\d{1,2}(?:st|nd|rd|th)[ \-]century\|(\d{1,2}(?:st|nd|rd|th))[ \-]century\]\]/$1 century/gi;
    $txt =~ s/\[\[\d{1,2}(?:st|nd|rd|th)[ \-]century\|(\d{1,2}(?:st|nd|rd|th))[ \-]centuries\]\]/$1 centuries/gi;
    # century with AD,BC etc
    $txt =~ s/\[\[(\d{1,2}(?:st|nd|rd|th))[ \-]century\s(AD|BC|CE|BCE)\]\]/$1 century $2/gi;
    $txt =~ s/\[\[\d{1,2}(?:st|nd|rd|th)[ \-]century\|(\d{1,2}(?:st|nd|rd|th))[ \-]century\s(AD|BC|CE|BCE)\]\]/$1 century $2/gi;
    $txt =~ s/\[\[\d{1,2}(?:st|nd|rd|th)[ \-]century\|(\d{1,2}(?:st|nd|rd|th))[ \-]centuries\s(AD|BC|CE|BCE)\]\]/$1 centuries $2/gi;
    $txt =~ s/(\d(?:st|nd|rd|th))[ \-]Century/$1 century/gi;

    # piped decades and years
    $txt =~ s/\[\[(\d{1,4}\'?s)\]\]/$1/gi;
    $txt =~ s/\[\[(\d{1,4}s? (?:AD|BC|CE|BCE))\]\]/$1/gi;
    $txt =~ s/\[\[\d{1,4}s? (?:AD|BC|CE|BCE)\|(\d{1,4})\]\]/$1/gi;
    $txt =~ s/\[\[\d{1,4}s? (?:AD|BC|CE|BCE)\|(\d{1,4}s? (?:AD|BC|CE|BCE))\]\]/$1/gi;
    $txt =~ s/\[\[\d{1,4}s?\|(\d{1,4}s? (?:AD|BC|CE|BCE))\]\]/$1/gi;
    $txt =~ s/\[\[\d{1,4}s?\|(\d{1,2}s?)\]\]/$1/gi;

    # months
    $txt =~ s/\[\[(January|February|March|April|May|June|July|August|September|October|November|December)\]\]/$1/gi;
    $txt =~ s/\[\[January\|(Jan)\]\]/$1/gi;
    $txt =~ s/\[\[February\|(Feb)\]\]/$1/gi;
    $txt =~ s/\[\[March\|(Mar)\]\]/$1/gi;
    $txt =~ s/\[\[April\|(Apr)\]\]/$1/gi;
    $txt =~ s/\[\[May\|(May)\]\]/$1/gi;
    $txt =~ s/\[\[June\|(Jun)\]\]/$1/gi;
    $txt =~ s/\[\[July\|(Jul)\]\]/$1/gi;
    $txt =~ s/\[\[August\|(Aug)\]\]/$1/gi;
    $txt =~ s/\[\[September\|(Sep)\]\]/$1/gi;
    $txt =~ s/\[\[October\|(Oct)\]\]/$1/gi;
    $txt =~ s/\[\[November\|(Nov)\]\]/$1/gi;
    $txt =~ s/\[\[December\|(Dec)\]\]/$1/gi;

    #month+year
    $txt =~ s/\[\[((?:January|February|March|April|May|June|July|August|September|October|November|December) \d{3,4})\]\]/$1/gi;

    #Month+day_number "March 7th" -> "March 7"
    $txt =~ s/\[\[(January|February|March|April|May|June|July|August|September|October|November|December) (\d?\d)(?:th|st|nd|rd)\]\]/\[\[$1 $2\]\]/gi;
    $txt =~ s/\[\[((?:January|February|March|April|May|June|July|August|September|October|November|December) \d?\d)\]\](?:th|st|nd|rd)/\[\[$1\]\]/gi;
    $txt =~ s/\[\[(\d?\d)(?:th|st|nd|rd) (January|February|March|April|May|June|July|August|September|October|November|December)\]\]/\[\[$1 $2\]\]/gi;

    #Month+day_number piped into number. Preferences do not work. They don't work in sequence because digits in the two dates must be adjacent
    $txt =~ s/([^\[]{4})\[\[((?:January|February|March|April|May|June|July|August|September|October|November|December) \d?\d)\]\](\s?\-?\s?)\[\[(?:January|February|March|April|May|June|July|August|September|October|November|December) \d{1,2}\|(\d{1,2})\]\]/$1$2$3$4/gi;
    #same again but with ndash or mdash instead of hyphen
    $txt =~ s/([^\[]{4})\[\[((?:January|February|March|April|May|June|July|August|September|October|November|December) \d?\d)\]\](\s?&[nm]dash;\s?)\[\[(?:January|February|March|April|May|June|July|August|September|October|November|December) \d{1,2}\|(\d{1,2})\]\]/$1$2$3$4/gi;
    #same again but with slash instead of hyphen
    $txt =~ s/([^\[]{4})\[\[((?:January|February|March|April|May|June|July|August|September|October|November|December) \d?\d)\]\](\/)\[\[(?:January|February|March|April|May|June|July|August|September|October|November|December) \d{1,2}\|(\d{1,2})\]\]/$1$2$3$4/gi;

    $txt =~ s/([^\[]{4})\[\[((?:January|February|March|April|May|June|July|August|September|October|November|December) \d?\d)\]\](\s?\-?\s?)\[\[(\d{1,2})\]\]/$1$2$3$4/gi;
    #same again but with ndash instead of hyphen
    $txt =~ s/([^\[]{4})\[\[((?:January|February|March|April|May|June|July|August|September|October|November|December) \d?\d)\]\](\s?&[nm]dash;\s?)\[\[(\d{1,2})\]\]/$1$2$3$4/gi;
    #same again but with slash instead of hyphen
    $txt =~ s/([^\[]{4})\[\[((?:January|February|March|April|May|June|July|August|September|October|November|December) \d?\d)\]\](\/)\[\[(\d{1,2})\]\]/$1$2$3$4/gi;

    $txt =~ s/([^\[]{4})\[\[(\d?\d) (?:January|February|March|April|May|June|July|August|September|October|November|December)\]\](\s?\-?\s?)\[\[(?:January|February|March|April|May|June|July|August|September|October|November|December) \d{1,2}\|(\d{1,2})\]\]/$1$2$3$4/gi;
    #same again but with ndash instead of hyphen
    $txt =~ s/([^\[]{4})\[\[(\d?\d) (?:January|February|March|April|May|June|July|August|September|October|November|December)\]\](\s?&[nm]dash;\s?)\[\[(?:January|February|March|April|May|June|July|August|September|October|November|December) \d{1,2}\|(\d{1,2})\]\]/$1$2$3$4/gi;
    #same again but with slash instead of hyphen
    $txt =~ s/([^\[]{4})\[\[(\d?\d) (?:January|February|March|April|May|June|July|August|September|October|November|December)\]\](\/)\[\[(?:January|February|March|April|May|June|July|August|September|October|November|December) \d{1,2}\|(\d{1,2})\]\]/$1$2$3$4/gi;

    $txt =~ s/([^\[]{4})\[\[(\d?\d) (?:January|February|March|April|May|June|July|August|September|October|November|December)\]\](\s?\-?\s?)\[\[(\d{1,2})\]\]/$1$2$3$4/gi;
    #same again but with ndash instead of hyphen
    $txt =~ s/([^\[]{4})\[\[(\d?\d) (?:January|February|March|April|May|June|July|August|September|October|November|December)\]\](\s?&[nm]dash;\s?)\[\[(\d{1,2})\]\]/$1$2$3$4/gi;
    #same again but with slash instead of hyphen
    $txt =~ s/([^\[]{4})\[\[(\d?\d) (?:January|February|March|April|May|June|July|August|September|October|November|December)\]\](\/)\[\[(\d{1,2})\]\]/$1$2$3$4/gi;

    $txt =~ s/\[\[(?:January|February|March|April|May|June|July|August|September|October|November|December) \d{1,2}\|(\d{1,2})\]\]/$1/gi;
    $txt =~ s/\[\[\d{1,2} (?:January|February|March|April|May|June|July|August|September|October|November|December)\|(\d{1,2})\]\]/$1/gi;

    $txt =~ s/\[\[(?:January|February|March|April|May|June|July|August|September|October|November|December) \d{1,2}\|((?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\s\d{1,2})\]\]/$1/gi;


    # solitary day_numbers
    $txt =~ s/\[\[(?:January|February|March|April|May|June|July|August|September|October|November|December) \d{1,2}\|(\d{1,2}(?:th|st|nd|rd))\]\]/$1/gi;
    $txt =~ s/\[\[\d{1,2} (?:January|February|March|April|May|June|July|August|September|October|November|December)\|(\d{1,2}(?:th|st|nd|rd))\]\]/$1/gi;
    $txt =~ s/\[\[(\d{1,2}(?:st|nd|rd|th))\]\]/$1/gi;

    # days of the week in full. Optional plurals
    $txt =~ s/\[\[(Mondays?|Tuesdays?|Wednesdays?|Thursdays?|Fridays?|Saturdays?|Sundays?)\]\]/$1/gi;
    # days of the week abbreviated. Leave out 'Sun' as potentially valid link to the Sun. Leave out 'SAT' in upper case as potential link to 'Scholastic achievement/aptitude test'.
    $txt =~ s/\[\[(Mon|Tue|Tues|Wed|Thu|Thur|Thurs|Fri)\]\]/$1/gi;
    $txt =~ s/\[\[(Sat)\]\]/$1/g;
    $txt =~ s/\[\[Mondays?\|(Mondays?)\]\]/$1/gi;
    $txt =~ s/\[\[Tuesdays?\|(Tuesdays?)\]\]/$1/gi;
    $txt =~ s/\[\[Wednesdays?\|(Wednesdays?)\]\]/$1/gi;
    $txt =~ s/\[\[Thursdays?\|(Thursdays?)\]\]/$1/gi;
    $txt =~ s/\[\[Fridays?\|(Fridays?)\]\]/$1/gi;
    $txt =~ s/\[\[Saturdays?\|(Saturdays?)\]\]/$1/gi;
    $txt =~ s/\[\[Sundays?\|(Sundays?)\]\]/$1/gi;

    #4 digit years piped into 2
    $txt =~ s/\[\[\d{1,4}\|(\d{1,2})\]\]/$1/gi;

    #year: examine characters in link on left for date, examine characters in link on right for date
    $txt =~ s/((?:[^yhletramub\s]..|[^rcianlse\d\s].|[^yhletr\d])\]\]\s?,?\-?\s?)\[\[(\d{1,4})\]\](\s?,?\-?\s?\[\[(?:[^jfmasond\d]|.[^aepuco\d\s]|..[^jfmasondbrylgptvc \s\-]))/$1$2$3/gi;
    #year pair: examine characters in link on left for date, examine characters in link on right for date
    $txt =~ s/((?:[^yhletramub\s]..|[^rcianlse\d\s].|[^yhletr\d])\]\]\s?,?\-?\s?)\[\[(\d{1,4})\]\](.?.?.?.?.?.?)\[\[(\d{1,4})\]\](\s?,?\-?\s?\[\[(?:[^jfmasond\d]|.[^aepuco\d\s]|..[^jfmasondbrylgptvc\s\-]))/$1$2$3$4$5/gi;

    #year: examine characters in link on left for date, avoid links on right
    $txt =~ s/((?:[^yhletramub\s]..|[^rcianlse\d\s].|[^yhletr\d])\]\]\s?,?\-?\s?)\[\[(\d{1,4})\]\]([^\[]{4})/$1$2$3/gi;
    #year pair: examine characters in link on left for date, avoid links on right
    $txt =~ s/((?:[^yhletramub\s]..|[^rcianlse\d\s].|[^yhletr\d])\]\]\s?,?\-?\s?)\[\[(\d{1,4})\]\](.?.?.?.?.?.?)\[\[(\d{1,4})\]\]([^\[]{4})/$1$2$3$4$5/gi;

    #year: check for line-ends, text on left, avoid links on right. Run twice to deal better with lists.
    $txt =~ s/([\w\(\);=:.'\*\|\&]\s?,?\-?\s?|\n)\[\[(\d{1,4})\]\]([^\[]{4}|\n)/$1$2$3/gi;
    $txt =~ s/([\w\(\);=:.'\*\|\&]\s?,?\-?\s?|\n)\[\[(\d{1,4})\]\]([^\[]{4}|\n)/$1$2$3/gi;
    #year pair: check for line-ends, text on left, avoid links on right
    $txt =~ s/([\w\(\);=:.'\*\|\&]\s?,?\-?\s?)\[\[(\d{1,4})\]\](.?.?.?.?.?.?)\[\[(\d{1,4})\]\]([^\[]{4}|\n)/$1$2$3$4$5/gi;

    #year: avoid links on left, examine characters in link on right for date
    $txt =~ s/([^\]]{4})\[\[(\d{1,4})\]\](\s?,?\-?\s?\[\[(?:[^jfmasond\d]|.[^aepuco\d\s]|..[^jfmasondbrylgptvc \s\-]))/$1$2$3/gi;
    #year pair: avoid links on left, examine characters in link on right for date
    $txt =~ s/([^\]]{4})\[\[(\d{1,4})\]\](.?.?.?.?.?.?)\[\[(\d{1,4})\]\](\s?,?\-?\s?\[\[(?:[^jfmasond\d]|.[^aepuco\d\s]|..[^jfmasondbrylgptvc \s\-]))/$1$2$3$4$5/gi;

    #year:avoid links on left, text on right
    $txt =~ s/([^\]]{4})\[\[(\d{1,4})\]\](\s?,?\-?\s?[\w\(\);=:.'\*\|\&])/$1$2$3/gi;
    #year pair: avoid links on left, text on right
    $txt =~ s/([^\]]{4})\[\[(\d{1,4})\]\](.?.?.?.?.?.?)\[\[(\d{1,4})\]\](\s?,?\-?\s?[\w\(\);=:.'\*\|\&])/$1$2$3$4$5/gi;

    #year:text on left, text on right
    $txt =~ s/([\w\(\);=:.'\*\|\&]\s?,?\-?\s?)\[\[(\d{1,4})\]\](\s?,?\-?\s?[\w\(\);=:.'\*\|\&])/$1$2$3/gi;
    #year pair: avoid links on left, text on right
    $txt =~ s/([\w\(\);=:.'\*\|\&]\s?,?\-?\s?)\[\[(\d{1,4})\]\](.?.?.?.?.?.?)\[\[(\d{1,4})\]\](\s?,?\-?\s?[\w\(\);=:.'\*\|\&])/$1$2$3$4$5/gi;

    #year:avoid links on left, hyphen but no digits (to avoid ISO date) in link on right. Currently suspended because it isn't fully tested.
    #$txt =~ s/([^\]]{4})\[\[(\d{1,4})\]\](\s?,?\-?\s?\[\[[^\d])/$1$2$3/gi;

    #year:avoid links on both sides
    $txt =~ s/([^\]]{4})\[\[(\d{1,4})\]\]([^\[]{4})/$1$2$3/gi;
    #year pair: avoid links on both sides
    $txt =~ s/([^\]]{4})\[\[(\d{1,4})\]\](.?.?.?.?.?.?)\[\[(\d{1,4})\]\]([^\[]{4})/$1$2$3$4$5/gi;

    #'present'
    $txt =~ s/\[\[Present \(time\)\|(Present)\]\]/$1/gi;

    #Eliminate 'surprise links' also known as 'easter egg links'
    $txt =~ s/\[\[\d{1,4}s?\sin\s[^\|]{1,30}\|(\d{1,4}s?)\]\]/$1/gi;

	return $txt;
}

sub replace_unlinked_tokens
{
	my $content = shift;
	
	# links

	$content =~ s/([^[|:])Amherst College/$1\[\[Amherst College\]\]/;
	$content =~ s/([^[|:])Confederate Army/$1\[\[Confederate States Army\]\]/;
	$content =~ s/([^[|:])Constitution of the United States/$1\[\[United States Constitution|Constitution of the United States\]\]/;
	$content =~ s/([^[|:])Democratic National Committee/$1\[\[Democratic National Committee\]\]/;
	$content =~ s/([^[|:])Democratic-Republican Party/$1\[\[Democratic-Republican Party (United States)|Democratic-Republican Party\]\]/;
	$content =~ s/([^[|:])Democratic Republican Party/$1\[\[Democratic-Republican Party (United States)|Democratic Republican Party\]\]/;
	$content =~ s/Department of Defense([^]|])/\[\[United States Department of Defense|Department of Defense\]\]$1/;
	$content =~ s/Department of War([^]|])/\[\[United States Department of War|Department of War\]\]$1/;
	$content =~ s/([^[|:])Eton College/$1\[\[Eton College\]\]/;
	$content =~ s/([^[|:])Free-Soil Party/$1\[\[Free Soil Party|Free-Soil Party\]\]/;
	$content =~ s/([^[|:])Harvard College/$1\[\[Harvard College\]\]/;
	$content =~ s/([^[|:])Republican National Committee/$1\[\[Republican National Committee\]\]/;
	$content =~ s/([^[|:])Union Army/$1\[\[Union Army\]\]/;
	$content =~ s/([^[|:])Union College/$1\[\[Union College\]\]/;
	$content =~ s/([^[|:])United States Army Medical Corps/$1\[\[Army Medical Department (United States)|United States Army Medical Corps\]\]/;
	$content =~ s/([^[|:])United States Army Reserve/$1\[\[United States Army Reserve\]\]/;
	$content =~ s/([^[|:])United States Treasury Department/$1\[\[United States Treasury Department\]\]/;
	$content =~ s/([^[|:])Washington, D\.C\./$1\[\[Washington, D.C.\]\]/;
	$content =~ s/([^[|:])William and Mary College/$1\[\[William and Mary College\]\]/;
	$content =~ s/([^[|:])Yale College/$1\[\[Yale College\]\]/;	

	$content =~ s/as a Democrat/as a \[\[Democratic Party (United States)|Democrat\]\]/;
	$content =~ s/as a Federalist/as a \[\[Federalist Party (United States)|Federalist\]\]/;
	$content =~ s/as a Republican/as a \[\[Republican Party (United States)|Republican\]\]/;
	$content =~ s/as a Whig/as a \[\[Whig Party (United States)|Whig\]\]/;
	
	# grammar-related replacements
	$content =~ s/graduated, /graduated from /g;
	$content =~ s/lawyer, private/lawyer in private/g;
	$content =~ s/, (\d\d\d\d) ?- ?(\d\d\d\d)/ from $1 to $2/g;
	$content =~ s/(member|chairman|chair), /$1 of the /g;
	$content =~ s/\&\#146\;/'/g;
	$content =~ s/\&\#14[78]\;/"/g;
	
	return $content;
}

1;