package Polbot; use strict; use LWP::UserAgent; # Here is an example for this sub's usage: # # my $url1 = 'http://bioguide.congress.gov/scripts/biodisplay.pl?index=H000671'; # print Polbot::bio2wiki($url1); sub bio2wiki { my $url = shift; # Constants my $pronoun = 'He'; #Unfortunately, there is no way to tell if the person is male or female from the bioguide. I hate assuming male here, but what can you do? my $preps = 'in|near|to|at|of'; my $months = 'January|February|March|April|May|June|July|August|September|October|November|December'; my $states = 'Alaska|Alabama|Arkansas|Arizona|California|Colorado|Connecticut|Deleware|Florida|Georgia|Hawaii|Idaho|Illinois|Indiana|Iowa|Kansas|Kentucky|Louisiana|Maine|Maryland|Massachusetts|Michigan|Minnesota|Mississippi|Montana|Missouri|Nebraska|Nevada|New Hampshire|New Jersey|New Mexico|New York|North Carolina|North Dakota|Ohio|Oklahoma|Oregon|Pennsylvania|Rhode Island|South Carolina|South Dakota|Tennessee|Texas|Utah|Vermont|Virginia|Washington|West Virginia|Wisconsin|Wyoming|Ireland|France|England|Scotland|Wales|Holland|Spain|Germany'; my $He_list = 'attended|became|commenced|completed|continued|declined|did|died|engaged|entered|established|graduated|is|journeyed|left|lived|lives|moved|owned|owns|participated|pursued|received|remained|remains|represented|represents|resigned|resumed|retired|returned|served|settled|signed|studied|successfully|taught|unsuccessfully|was|went|worked|works'; my $Hewas_list = 'a|an|admitted|affiliated|appointed|assigned|author|discharged|editor|educated|employed|engaged|entombed|impeached|interred|interested|not|one|owner|promoted|publisher|reelected|re-elected|reinterred'; my $Servedas_list = 'Court|Democratic|Republican|adjutant|aide|assistant|associate|businessman|businesswoman|captain|chair|chairman|clerk|collector|colonel|commissioner|defense|delegate|director|district|general|governor|inspector|judge|justice|lieutenant|magistrate|master|mayor|[mM]ember|naval|overseer|president|presidential|proprietor|prosecuting|solicitor|special|staff|vice|war'; # Connect to the URL my $ua = new LWP::UserAgent; $ua->agent("Mozilla/6.0"); my $req = new HTTP::Request GET => $url; my $res = $ua->request($req); $res->is_success or die "Could not get content"; # Get the content my $content = $res->content; $content =~ s/^.*<P><FONT SIZE=4 COLOR=\"\#800040\">([^<]*), ?<\/FONT>([^<]*)<\/(TD|P)>.*$/$2/s; # Just the main text (minus name) my $reversedname = $1; $content =~ s/\n//sg; # as a single line # Parse name $reversedname =~ s/\s+/ /g; $reversedname =~ m/^([^,]*), ([^,]*)(, .*)?$/; my $firstname = $2; my $lastname = $1; my $suffix = $3; #die ">$foundname< => >$foundfirstname< >$foundlastname< >$foundsuffix<\n"; $lastname =~ s/(\w+)/\u\L$1/g; $reversedname = "$lastname, $firstname$suffix"; my $fullname = "$firstname $lastname$suffix"; # Do universal search & replaces $content =~ s/\s+/ /g; #take out dbl spaces; $content = unabbreviate_states($content); #expand all state names $content = link_cities_from_pattern($content); $content = link_dates_from_pattern($content); $content = link_colleges_from_pattern($content); $content = replace_recognized_tokens($content); # split into individual lines my @lines = split(/; /, $content); foreach my $line (@lines) { $line =~ s/^ // } #take out leading space (if there) # Set up initial variables my $familyinfo = ''; my $iswas = 'is'; my $initial_description = ''; my $birthdeath = 'unknown birth and death'; my $birth = ''; my $birthyear = ''; my $death = ''; my $deathyear = ''; my $body = ''; my %cats = (); # for categories like "Senator from Kentucky" # line 1. First off, does it start with " (son of . . .), " or something similar? # e.g. brother of John Fitzgerald Kennedy and Robert Francis Kennedy, grandson of John Francis Fitzgerald my $line = shift(@lines); if ($line =~ m/^\(([^)]*)\)/) { $familyinfo = $1; $line =~ s/^\([^)]*\), (.*)$/$1/; $familyinfo =~ s/of ([^,]*),/of [[$1]],/g; $familyinfo =~ s/of ([^,]*)$/of [[$1]]/g; $familyinfo =~ s/([^],]) and /$1]] and [[/g; } # Now, make line1 into the initial description, and pick categories. $initial_description = $line; while ($initial_description =~ m/(a Senator and a Representative|a Representative and a Senator) from ($states)/g) { #senator and rep $cats{"[[Category:United States Senators from $2]]"} = $2; $cats{"[[Category:Members of the United States House of Representatives from $2]]"} = $2; } while ($initial_description =~ m/Senator from ($states)/g) { $cats{"[[Category:United States Senators from $1]]"} = $1; } while ($initial_description =~ m/Representative from ($states)/g) { $cats{"[[Category:Members of the United States House of Representatives from $1]]"} = $1; } $initial_description =~ s/(Territory of )?($states)/[[$1$2]]/g; $initial_description =~ s/Senator/[[United States Senate|U.S. Senator]]/g; $initial_description =~ s/Representative/[[United States House of Representatives|U.S. Representative]]/g; # Next line: look for birth place and date. my $line = shift(@lines); if ($line =~ m/(born|Born)/) { if ($line =~ m/^(.*), in (\d+)$/) { $birthyear = $2; $birth = $2; $line = $1; } elsif ($line =~ m/^(.*), about (\d+)$/) { $birth = "ca. $2"; $birthyear = $2; $line = $1 } elsif ($line =~ m/^(.*?)(?:,)? (?:on )?(\[\[\w* \d+\]\], \[\[(\d+)\]\])$/) { $birth = $2; $birthyear = $3; $line = $1; } elsif ($line =~ m/^(.*), birth date (unknown)/) { $birth = $2; $line = $1; } else { $birth = 'unknown'; } if ($line =~ s/^(was |probably )?born/Born/) { $body .= "$line, $lastname"; } elsif ($line eq 'birth date unknown') { $body = $lastname; } else { die "I didn't expect: $line"; } } else { $birth = 'unknown'; $body = prepend_line($lastname, $lastname, $line); } # Next line. . . my $line = shift(@lines); $line = prepend_line('', $lastname, $line); $body .= $line; # Subsequent lines. . . while ($line = shift(@lines)) { if ($line eq 'birth date unknown') { $birth = 'unknown'; $birthyear = ''; next; } if ($line =~ m/^[dD]eath date unknown\.? ?$/) { $death = 'unknown'; $deathyear = ''; $iswas = 'was'; next; } $line = prepend_line($pronoun, $lastname, $line); # look for death if ($line =~ m/(died|death(?! of)).*(\d\d\d\d)/) { $deathyear = $2; $death = $deathyear; $iswas = 'was'; #TODO - change this to ignore "death of", check against http://bioguide.congress.gov/scripts/biodisplay.pl?index=A000022 if ($line =~ m/(died|death(?! of)).*(\[\[($months) \d+\]\], \[\[\d\d\d\d\]\])/) { $death = $2; } } $body .= $line; } # Finalize Initial description. if ($birth) { if ($death) { $birthdeath = "$birth - $death"; if ($birthdeath eq 'unknown - unknown') { $birthdeath = 'birth and death dates unknown'; } } else { if ($birth eq 'unknown') { $birthdeath = 'unknown date of birth'; } else { $birthdeath = "born $birth"; } } } my $boilerplate = "<!" . "-- This article was automatically created by [[User:polbot]] from $url. The prose may be stilted, and there may be grammatical and Wikification errors. Please improve in any way you see fit. --" . ">"; $initial_description = "$boilerplate'''" . $fullname . "''' ($birthdeath) $iswas " . $initial_description; if ($familyinfo) { $initial_description .= ", " . $familyinfo; } # Add ending stuff $url =~ m/^.*=(.*)$/; my $ending_stuff = "==Source==\n{{CongBio|$1}}\n\n{{DEFAULTSORT:$reversedname}}\n"; if ($birthyear) { $cats{"[[Category:$birthyear births]]"} = 'a' #$ending_stuff .= "[[Category:$birthyear births]]\n"; } else { $cats{"[[Category:Year of birth unknown]]"} = 'a' #$ending_stuff .= "\n"; } if ($iswas eq 'is') { $cats{"[[Category:Living people]]"} = 'a' #$ending_stuff .= "\n"; } elsif ($death =~ m/\d\d\d\d/) { $cats{"[[Category:$deathyear deaths]]"} = 'a' #$ending_stuff .= "\n"; } else { $cats{"[[Category:Year of death unknown]]"} = 'a' #$ending_stuff .= "\n"; } $ending_stuff .= join("\n", sort keys %cats); # Done! $body = "$initial_description.\n\n$body\n$ending_stuff"; return $body; # =================================================================================================== # ==================== Inner subs =============================================================== # =================================================================================================== sub prepend_line { my $starter = shift; my $lastname = shift; my $line = shift; my $analyzeline = $line; # If the line starts with these, skip them. $analyzeline =~ s/^after the war//; $analyzeline =~ s/^again//; $analyzeline =~ s/^also//; $analyzeline =~ s/^originally//; $analyzeline =~ s/^several times//; $analyzeline =~ s/^soon afterward//; $analyzeline =~ s/^subsequently//; #Get my ($initchar) = ($analyzeline =~ m/(.)/); my ($initword) = ($analyzeline =~ m/(\w+)/); if ($initchar eq '[') { $line = "$starter was in the $line.\n"; } elsif ($initword =~ /^(successful|lawyer|teacher)$/) { $line = "$starter was a $line.\n"; } elsif ($initword eq 'unsuccessful') { $line = "$starter was an $line.\n"; } elsif ($initword eq 'elected') { $line = "\n$lastname was $line.\n"; } elsif ($initword =~ m/^($He_list)$/) { $line = "$starter $line.\n"; } elsif ($initword =~ m/^($Hewas_list)$/) { $line = "$starter was $line.\n"; } elsif ($initword =~ m/^($Servedas_list)$/) { $line = "$starter served as $line.\n"; } elsif ($initword =~ /^(re)?interment$/) { $line =~ s/^(re)?interment/$starter was $1interred/; $line = "$line.\n"; $iswas = 'was'; } else { $line =~ s/^([a-z])/\U$1/; $line = "<!" . "-- A grammar fix may be needed here. --" . ">$line.\n"; } # clean up $line =~ s/(\.? \.|\. )$/./; return $line; } sub replace_recognized_tokens { my $content = shift; # links $content =~ s/Amherst College/[[Amherst College]]/g; $content =~ s/Civil War/[[American Civil War|Civil War]]/g; $content =~ s/Confederate Army/[[Confederate States Army]]/g; $content =~ s/Confederate States of America/[[Confederate States of America]]/g; $content =~ s/Constitution of the United States/[[United States Constitution|Constitution of the United States]]/g; $content =~ s/Democratic National Committee/[[Democratic National Committee]]/g; $content =~ s/Democratic Party/[[Democratic Party (United States)|Democratic Party]]/g; $content =~ s/Democratic-Republican Party/[[Democratic-Republican Party (United States)|Democratic-Republican Party]]/g; $content =~ s/Democratic Republican Party/[[Democratic-Republican Party (United States)|Democratic Republican Party]]/g; $content =~ s/Department of Defense/[[United States Department of Defense|Department of Defense]]/g; $content =~ s/Department of War/[[United States Department of War|Department of War]]/g; $content =~ s/Eton College/[[Eton College]]/g; $content =~ s/Federalist Party/[[Federalist Party (United States)|Federalist Party]]/g; $content =~ s/Free-Soil Party/[[Free Soil Party|Free-Soil Party]]/g; $content =~ s/Harvard College/[[Harvard College]]/g; $content =~ s/justice of the peace/[[Justice of the Peace]]/g; $content =~ s/Opposition Party/[[Opposition Party (United States)|Opposition Party]]/g; $content =~ s/Republican National Committee/[[Republican National Committee]]/g; $content =~ s/Revolutionary War/[[American Revolutionary War|Revolutionary War]]/g; $content =~ s/Union Army/[[Union Army]]/g; $content =~ s/Union College/[[Union College]]/g; $content =~ s/United States Air Force/[[United States Air Force]]/g; $content =~ s/United States Army Medical Corps/[[Army Medical Department (United States)|United States Army Medical Corps]]/g; $content =~ s/United States Army Reserve/[[United States Army Reserve]]/g; $content =~ s/United States House of Representatives/[[United States House of Representatives]]/g; $content =~ s/United States Marine Corps/[[United States Marine Corps]]/g; $content =~ s/United States Marines/[[United States Marine Corps]]/g; $content =~ s/United States Navy/[[United States Navy]]/g; $content =~ s/United States Representative/[[United States Representative]]/g; $content =~ s/United States Senate/[[United States Senate]]/g; $content =~ s/United States Senator/[[United States Senator]]/g; $content =~ s/United States Supreme Court/[[Supreme Court of the United States|United States Supreme Court]]/g; $content =~ s/United States Treasury Department/[[United States Treasury Department]]/g; $content =~ s/(Vice )?President of the United States/[[$1President of the United States]]/g; $content =~ s/Washington, D.C./[[Washington, D.C.]]/g; $content =~ s/William and Mary College/[[William and Mary College]]/g; $content =~ s/Yale College/[[Yale College]]/g; $content =~ s/Republican Party/[[Republican Party (United States)|Republican Party]]/g; $content =~ s/United States Army/[[United States Army]]/g; $content =~ s/as a Democrat/as a [[Democratic Party (United States)|Democrat]]/g; $content =~ s/as a Federalist/as a [[Federalist Party (United States)|Federalist]]/g; $content =~ s/as a Republican/as a [[Republican Party (United States)|Republican]]/g; $content =~ s/as a Whig/as a [[Whig Party (United States)|Whig]]/g; $content =~ s/($states) (state )?senate/[[$1 Senate]]/g; $content =~ s/($states) (state )?house of representatives/[[$1 House of Representatives]]/g; # grammar-related replacements $content =~ s/graduated, /graduated from /g; $content =~ s/lawyer, private/lawyer in private/g; $content =~ s/, (\d\d\d\d) ?- ?(\d\d\d\d)/ from $1 to $2/g; $content =~ s/\(([^)]*)\;/($1, and/g; $content =~ s/(member|chairman|chair), /$1 of the /g; $content =~ s/\&\#146\;/'/g; $content =~ s/\&\#14[78]\;/"/g; return $content; } sub link_colleges_from_pattern { my $content = shift; # "Something University" $content =~ s/(([A-Z][a-z]+ (and )?)*[A-Z][a-z]+ (University|Academy))/\[\[$1\]\]/g; # "University of Something" $content =~ s/(University of [A-Z][a-z]+( (at )?[A-Z][a-z]+)*)/\[\[$1\]\]/g; return $content; } sub link_dates_from_pattern { my $content = shift; $content =~ s/($months) (\d+), *(\d\d\d\d)/[[$1 $2]], [[$3]]/g; return $content; } sub link_cities_from_pattern { my $content = shift; #prep City, State (or prep County, State) $content =~ s/ ($preps) ([A-Z][a-z]*( [A-Z][a-z]*)*, ($states))/ $1 [[$2]]/g; #prep City, Something County, State $content =~ s/ ($preps) ([A-Z][a-z]*( [A-Z][a-z]*)*),( [A-Z][a-z]*)* County, (($states))/ $1 [[$2, $5]]/g; #, City, Something County, State $content =~ s/, ([A-Z][a-z]*( [A-Z][a-z]*)*),( [A-Z][a-z]*)* County, (($states))/, [[$1, $4]]/g; #, Something, State $content =~ s/, ([A-Z][a-z]*( [A-Z][a-z]*)*, ($states))/, [[$1]]/g; return $content; } sub unabbreviate_states { my $content = shift; $content =~ s/Ala\./Alabama/g; $content =~ s/Ariz\./Arizona/g; $content =~ s/Ark\./Arkansas/g; $content =~ s/Calif\./California/g; $content =~ s/Colo\./Colorado/g; $content =~ s/Conn\./Connecticut/g; $content =~ s/Del\./Delaware/g; $content =~ s/Fla\./Florida/g; $content =~ s/Ga\./Georgia/g; $content =~ s/Ill\./Illinois/g; $content =~ s/Ind\./Indiana/g; $content =~ s/Kans\./Kansas/g; $content =~ s/Ky\./Kentucky/g; $content =~ s/La\./Louisiana/g; $content =~ s/Md\./Maryland/g; $content =~ s/Mass\./Massachusetts/g; $content =~ s/Mich\./Michigan/g; $content =~ s/Minn\./Minnesota/g; $content =~ s/Miss\./Mississippi/g; $content =~ s/Mo\./Missouri/g; $content =~ s/Mont\./Montana/g; $content =~ s/Nebr\./Nebraska/g; $content =~ s/Nev\./Nevada/g; $content =~ s/N\.H\./New Hampshire/g; $content =~ s/N\.J\./New Jersey/g; $content =~ s/N\.M\./New Mexico/g; $content =~ s/N\.Y\./New York/g; $content =~ s/N\.C\./North Carolina/g; $content =~ s/N\.D\./North Dakota/g; $content =~ s/Okla\./Oklahoma/g; $content =~ s/Ore\./Oregon/g; $content =~ s/Pa\./Pennsylvania/g; $content =~ s/R\.I\./Rhode Island/g; $content =~ s/S\.C\./South Carolina/g; $content =~ s/S\.D\./South Dakota/g; $content =~ s/Tenn\./Tennessee/g; $content =~ s/Tex\./Texas/g; $content =~ s/Vt\./Vermont/g; $content =~ s/Va\./Virginia/g; $content =~ s/Wash\./Washington/g; $content =~ s/W\.Va\./West Virginia/g; $content =~ s/Wis\./Wisconsin/g; $content =~ s/Wyo\./Wyoming/g; return $content; } } # Here is an example for this sub's usage: # $URL = Polbot::Get_URL_from_name("Mitch McConnell"); sub Get_URL_from_name { my $article_name = shift; my @URLs = (); my $ErrMsg; my $fname; my $lname; $article_name =~ s/ \(.*\)//g; # Take out anything parenthesized. if ($article_name =~ m/^(.*) ([^ ]*)(, Jr.|, Sr.| II| III)$/) { $fname = $1 . $3; $lname = $2; } elsif ($article_name =~ m/^(.*) ([^ ]*)$/) { $fname = $1; $lname = $2; } else { return "Malformed article name '$article_name'"; } @URLs = Get_matching_URLs($fname, $lname); my $nummatches = scalar(@URLs); if ($nummatches eq 1) { return $URLs[0]; } elsif ($nummatches > 1) { return "Multiple hits for '$lname, $fname'."; } $ErrMsg = "No hits for '$lname, $fname'."; # Take off the suffix if ($fname =~ s/(, Jr\.|, Sr\.| II| III)$//) { @URLs = Get_matching_URLs($fname, $lname); my $nummatches = scalar(@URLs); if ($nummatches eq 1) { return $URLs[0]; } elsif ($nummatches > 1) { $ErrMsg .= " Multiple hits for '$lname, $fname'."; return $ErrMsg; } $ErrMsg .= " No hits for '$lname, $fname'."; } # Try like "C. Everett Coop" if ($fname =~ s/^.\. //) { @URLs = Get_matching_URLs($fname, $lname); my $nummatches = scalar(@URLs); if ($nummatches eq 1) { return $URLs[0]; } elsif ($nummatches > 1) { $ErrMsg .= " Multiple hits for '$lname, $fname'."; return $ErrMsg; } $ErrMsg .= " No hits for '$lname, $fname'."; } # Try like "John Q. Adams" if ($fname =~ s/\..*$//) { @URLs = Get_matching_URLs($fname, $lname); my $nummatches = scalar(@URLs); if ($nummatches eq 1) { return $URLs[0]; } elsif ($nummatches > 1) { $ErrMsg .= " Multiple hits for '$lname, $fname'."; return $ErrMsg; } $ErrMsg .= " No hits for '$lname, $fname'."; } return $ErrMsg; } sub Get_matching_URLs { my $firstname = shift; my $lastname = shift; my $url = 'http://bioguide.congress.gov/biosearch/biosearch1.asp'; my $ua = LWP::UserAgent->new; $ua->agent("Mozilla/6.0"); my @links = (); my $res = $ua->post($url, ['lastname' => $lastname, 'firstname' => $firstname]); if ($res->is_success) { my $content = $res->content; @links = ($content =~ m/<td><A HREF=\"([^"]*)\">/g); } else { print "could not connect, lastname = $lastname, firstname=$firstname" } return @links; } sub fix_dates { my $txt = shift; # century without AD,BC etc $txt =~ s/\[\[(\d{1,2}(?:st|nd|rd|th))[ \-]century\]\]/$1 century/gi; $txt =~ s/\[\[\d{1,2}(?:st|nd|rd|th)[ \-]century\|(\d{1,2}(?:st|nd|rd|th))\]\]/$1/gi; $txt =~ s/\[\[\d{1,2}(?:st|nd|rd|th)[ \-]century\|(\d{1,2}(?:st|nd|rd|th))[ \-]century\]\]/$1 century/gi; $txt =~ s/\[\[\d{1,2}(?:st|nd|rd|th)[ \-]century\|(\d{1,2}(?:st|nd|rd|th))[ \-]centuries\]\]/$1 centuries/gi; # century with AD,BC etc $txt =~ s/\[\[(\d{1,2}(?:st|nd|rd|th))[ \-]century\s(AD|BC|CE|BCE)\]\]/$1 century $2/gi; $txt =~ s/\[\[\d{1,2}(?:st|nd|rd|th)[ \-]century\|(\d{1,2}(?:st|nd|rd|th))[ \-]century\s(AD|BC|CE|BCE)\]\]/$1 century $2/gi; $txt =~ s/\[\[\d{1,2}(?:st|nd|rd|th)[ \-]century\|(\d{1,2}(?:st|nd|rd|th))[ \-]centuries\s(AD|BC|CE|BCE)\]\]/$1 centuries $2/gi; $txt =~ s/(\d(?:st|nd|rd|th))[ \-]Century/$1 century/gi; # piped decades and years $txt =~ s/\[\[(\d{1,4}\'?s)\]\]/$1/gi; $txt =~ s/\[\[(\d{1,4}s? (?:AD|BC|CE|BCE))\]\]/$1/gi; $txt =~ s/\[\[\d{1,4}s? (?:AD|BC|CE|BCE)\|(\d{1,4})\]\]/$1/gi; $txt =~ s/\[\[\d{1,4}s? (?:AD|BC|CE|BCE)\|(\d{1,4}s? (?:AD|BC|CE|BCE))\]\]/$1/gi; $txt =~ s/\[\[\d{1,4}s?\|(\d{1,4}s? (?:AD|BC|CE|BCE))\]\]/$1/gi; $txt =~ s/\[\[\d{1,4}s?\|(\d{1,2}s?)\]\]/$1/gi; # months $txt =~ s/\[\[(January|February|March|April|May|June|July|August|September|October|November|December)\]\]/$1/gi; $txt =~ s/\[\[January\|(Jan)\]\]/$1/gi; $txt =~ s/\[\[February\|(Feb)\]\]/$1/gi; $txt =~ s/\[\[March\|(Mar)\]\]/$1/gi; $txt =~ s/\[\[April\|(Apr)\]\]/$1/gi; $txt =~ s/\[\[May\|(May)\]\]/$1/gi; $txt =~ s/\[\[June\|(Jun)\]\]/$1/gi; $txt =~ s/\[\[July\|(Jul)\]\]/$1/gi; $txt =~ s/\[\[August\|(Aug)\]\]/$1/gi; $txt =~ s/\[\[September\|(Sep)\]\]/$1/gi; $txt =~ s/\[\[October\|(Oct)\]\]/$1/gi; $txt =~ s/\[\[November\|(Nov)\]\]/$1/gi; $txt =~ s/\[\[December\|(Dec)\]\]/$1/gi; #month+year $txt =~ s/\[\[((?:January|February|March|April|May|June|July|August|September|October|November|December) \d{3,4})\]\]/$1/gi; #Month+day_number "March 7th" -> "March 7" $txt =~ s/\[\[(January|February|March|April|May|June|July|August|September|October|November|December) (\d?\d)(?:th|st|nd|rd)\]\]/\[\[$1 $2\]\]/gi; $txt =~ s/\[\[((?:January|February|March|April|May|June|July|August|September|October|November|December) \d?\d)\]\](?:th|st|nd|rd)/\[\[$1\]\]/gi; $txt =~ s/\[\[(\d?\d)(?:th|st|nd|rd) (January|February|March|April|May|June|July|August|September|October|November|December)\]\]/\[\[$1 $2\]\]/gi; #Month+day_number piped into number. Preferences do not work. They don't work in sequence because digits in the two dates must be adjacent $txt =~ s/([^\[]{4})\[\[((?:January|February|March|April|May|June|July|August|September|October|November|December) \d?\d)\]\](\s?\-?\s?)\[\[(?:January|February|March|April|May|June|July|August|September|October|November|December) \d{1,2}\|(\d{1,2})\]\]/$1$2$3$4/gi; #same again but with ndash or mdash instead of hyphen $txt =~ s/([^\[]{4})\[\[((?:January|February|March|April|May|June|July|August|September|October|November|December) \d?\d)\]\](\s?&[nm]dash;\s?)\[\[(?:January|February|March|April|May|June|July|August|September|October|November|December) \d{1,2}\|(\d{1,2})\]\]/$1$2$3$4/gi; #same again but with slash instead of hyphen $txt =~ s/([^\[]{4})\[\[((?:January|February|March|April|May|June|July|August|September|October|November|December) \d?\d)\]\](\/)\[\[(?:January|February|March|April|May|June|July|August|September|October|November|December) \d{1,2}\|(\d{1,2})\]\]/$1$2$3$4/gi; $txt =~ s/([^\[]{4})\[\[((?:January|February|March|April|May|June|July|August|September|October|November|December) \d?\d)\]\](\s?\-?\s?)\[\[(\d{1,2})\]\]/$1$2$3$4/gi; #same again but with ndash instead of hyphen $txt =~ s/([^\[]{4})\[\[((?:January|February|March|April|May|June|July|August|September|October|November|December) \d?\d)\]\](\s?&[nm]dash;\s?)\[\[(\d{1,2})\]\]/$1$2$3$4/gi; #same again but with slash instead of hyphen $txt =~ s/([^\[]{4})\[\[((?:January|February|March|April|May|June|July|August|September|October|November|December) \d?\d)\]\](\/)\[\[(\d{1,2})\]\]/$1$2$3$4/gi; $txt =~ s/([^\[]{4})\[\[(\d?\d) (?:January|February|March|April|May|June|July|August|September|October|November|December)\]\](\s?\-?\s?)\[\[(?:January|February|March|April|May|June|July|August|September|October|November|December) \d{1,2}\|(\d{1,2})\]\]/$1$2$3$4/gi; #same again but with ndash instead of hyphen $txt =~ s/([^\[]{4})\[\[(\d?\d) (?:January|February|March|April|May|June|July|August|September|October|November|December)\]\](\s?&[nm]dash;\s?)\[\[(?:January|February|March|April|May|June|July|August|September|October|November|December) \d{1,2}\|(\d{1,2})\]\]/$1$2$3$4/gi; #same again but with slash instead of hyphen $txt =~ s/([^\[]{4})\[\[(\d?\d) (?:January|February|March|April|May|June|July|August|September|October|November|December)\]\](\/)\[\[(?:January|February|March|April|May|June|July|August|September|October|November|December) \d{1,2}\|(\d{1,2})\]\]/$1$2$3$4/gi; $txt =~ s/([^\[]{4})\[\[(\d?\d) (?:January|February|March|April|May|June|July|August|September|October|November|December)\]\](\s?\-?\s?)\[\[(\d{1,2})\]\]/$1$2$3$4/gi; #same again but with ndash instead of hyphen $txt =~ s/([^\[]{4})\[\[(\d?\d) (?:January|February|March|April|May|June|July|August|September|October|November|December)\]\](\s?&[nm]dash;\s?)\[\[(\d{1,2})\]\]/$1$2$3$4/gi; #same again but with slash instead of hyphen $txt =~ s/([^\[]{4})\[\[(\d?\d) (?:January|February|March|April|May|June|July|August|September|October|November|December)\]\](\/)\[\[(\d{1,2})\]\]/$1$2$3$4/gi; $txt =~ s/\[\[(?:January|February|March|April|May|June|July|August|September|October|November|December) \d{1,2}\|(\d{1,2})\]\]/$1/gi; $txt =~ s/\[\[\d{1,2} (?:January|February|March|April|May|June|July|August|September|October|November|December)\|(\d{1,2})\]\]/$1/gi; $txt =~ s/\[\[(?:January|February|March|April|May|June|July|August|September|October|November|December) \d{1,2}\|((?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\s\d{1,2})\]\]/$1/gi; # solitary day_numbers $txt =~ s/\[\[(?:January|February|March|April|May|June|July|August|September|October|November|December) \d{1,2}\|(\d{1,2}(?:th|st|nd|rd))\]\]/$1/gi; $txt =~ s/\[\[\d{1,2} (?:January|February|March|April|May|June|July|August|September|October|November|December)\|(\d{1,2}(?:th|st|nd|rd))\]\]/$1/gi; $txt =~ s/\[\[(\d{1,2}(?:st|nd|rd|th))\]\]/$1/gi; # days of the week in full. Optional plurals $txt =~ s/\[\[(Mondays?|Tuesdays?|Wednesdays?|Thursdays?|Fridays?|Saturdays?|Sundays?)\]\]/$1/gi; # days of the week abbreviated. Leave out 'Sun' as potentially valid link to the Sun. Leave out 'SAT' in upper case as potential link to 'Scholastic achievement/aptitude test'. $txt =~ s/\[\[(Mon|Tue|Tues|Wed|Thu|Thur|Thurs|Fri)\]\]/$1/gi; $txt =~ s/\[\[(Sat)\]\]/$1/g; $txt =~ s/\[\[Mondays?\|(Mondays?)\]\]/$1/gi; $txt =~ s/\[\[Tuesdays?\|(Tuesdays?)\]\]/$1/gi; $txt =~ s/\[\[Wednesdays?\|(Wednesdays?)\]\]/$1/gi; $txt =~ s/\[\[Thursdays?\|(Thursdays?)\]\]/$1/gi; $txt =~ s/\[\[Fridays?\|(Fridays?)\]\]/$1/gi; $txt =~ s/\[\[Saturdays?\|(Saturdays?)\]\]/$1/gi; $txt =~ s/\[\[Sundays?\|(Sundays?)\]\]/$1/gi; #4 digit years piped into 2 $txt =~ s/\[\[\d{1,4}\|(\d{1,2})\]\]/$1/gi; #year: examine characters in link on left for date, examine characters in link on right for date $txt =~ s/((?:[^yhletramub\s]..|[^rcianlse\d\s].|[^yhletr\d])\]\]\s?,?\-?\s?)\[\[(\d{1,4})\]\](\s?,?\-?\s?\[\[(?:[^jfmasond\d]|.[^aepuco\d\s]|..[^jfmasondbrylgptvc \s\-]))/$1$2$3/gi; #year pair: examine characters in link on left for date, examine characters in link on right for date $txt =~ s/((?:[^yhletramub\s]..|[^rcianlse\d\s].|[^yhletr\d])\]\]\s?,?\-?\s?)\[\[(\d{1,4})\]\](.?.?.?.?.?.?)\[\[(\d{1,4})\]\](\s?,?\-?\s?\[\[(?:[^jfmasond\d]|.[^aepuco\d\s]|..[^jfmasondbrylgptvc\s\-]))/$1$2$3$4$5/gi; #year: examine characters in link on left for date, avoid links on right $txt =~ s/((?:[^yhletramub\s]..|[^rcianlse\d\s].|[^yhletr\d])\]\]\s?,?\-?\s?)\[\[(\d{1,4})\]\]([^\[]{4})/$1$2$3/gi; #year pair: examine characters in link on left for date, avoid links on right $txt =~ s/((?:[^yhletramub\s]..|[^rcianlse\d\s].|[^yhletr\d])\]\]\s?,?\-?\s?)\[\[(\d{1,4})\]\](.?.?.?.?.?.?)\[\[(\d{1,4})\]\]([^\[]{4})/$1$2$3$4$5/gi; #year: check for line-ends, text on left, avoid links on right. Run twice to deal better with lists. $txt =~ s/([\w\(\);=:.'\*\|\&]\s?,?\-?\s?|\n)\[\[(\d{1,4})\]\]([^\[]{4}|\n)/$1$2$3/gi; $txt =~ s/([\w\(\);=:.'\*\|\&]\s?,?\-?\s?|\n)\[\[(\d{1,4})\]\]([^\[]{4}|\n)/$1$2$3/gi; #year pair: check for line-ends, text on left, avoid links on right $txt =~ s/([\w\(\);=:.'\*\|\&]\s?,?\-?\s?)\[\[(\d{1,4})\]\](.?.?.?.?.?.?)\[\[(\d{1,4})\]\]([^\[]{4}|\n)/$1$2$3$4$5/gi; #year: avoid links on left, examine characters in link on right for date $txt =~ s/([^\]]{4})\[\[(\d{1,4})\]\](\s?,?\-?\s?\[\[(?:[^jfmasond\d]|.[^aepuco\d\s]|..[^jfmasondbrylgptvc \s\-]))/$1$2$3/gi; #year pair: avoid links on left, examine characters in link on right for date $txt =~ s/([^\]]{4})\[\[(\d{1,4})\]\](.?.?.?.?.?.?)\[\[(\d{1,4})\]\](\s?,?\-?\s?\[\[(?:[^jfmasond\d]|.[^aepuco\d\s]|..[^jfmasondbrylgptvc \s\-]))/$1$2$3$4$5/gi; #year:avoid links on left, text on right $txt =~ s/([^\]]{4})\[\[(\d{1,4})\]\](\s?,?\-?\s?[\w\(\);=:.'\*\|\&])/$1$2$3/gi; #year pair: avoid links on left, text on right $txt =~ s/([^\]]{4})\[\[(\d{1,4})\]\](.?.?.?.?.?.?)\[\[(\d{1,4})\]\](\s?,?\-?\s?[\w\(\);=:.'\*\|\&])/$1$2$3$4$5/gi; #year:text on left, text on right $txt =~ s/([\w\(\);=:.'\*\|\&]\s?,?\-?\s?)\[\[(\d{1,4})\]\](\s?,?\-?\s?[\w\(\);=:.'\*\|\&])/$1$2$3/gi; #year pair: avoid links on left, text on right $txt =~ s/([\w\(\);=:.'\*\|\&]\s?,?\-?\s?)\[\[(\d{1,4})\]\](.?.?.?.?.?.?)\[\[(\d{1,4})\]\](\s?,?\-?\s?[\w\(\);=:.'\*\|\&])/$1$2$3$4$5/gi; #year:avoid links on left, hyphen but no digits (to avoid ISO date) in link on right. Currently suspended because it isn't fully tested. #$txt =~ s/([^\]]{4})\[\[(\d{1,4})\]\](\s?,?\-?\s?\[\[[^\d])/$1$2$3/gi; #year:avoid links on both sides $txt =~ s/([^\]]{4})\[\[(\d{1,4})\]\]([^\[]{4})/$1$2$3/gi; #year pair: avoid links on both sides $txt =~ s/([^\]]{4})\[\[(\d{1,4})\]\](.?.?.?.?.?.?)\[\[(\d{1,4})\]\]([^\[]{4})/$1$2$3$4$5/gi; #'present' $txt =~ s/\[\[Present \(time\)\|(Present)\]\]/$1/gi; #Eliminate 'surprise links' also known as 'easter egg links' $txt =~ s/\[\[\d{1,4}s?\sin\s[^\|]{1,30}\|(\d{1,4}s?)\]\]/$1/gi; return $txt; } sub replace_unlinked_tokens { my $content = shift; # links $content =~ s/([^[|:])Amherst College/$1\[\[Amherst College\]\]/; $content =~ s/([^[|:])Confederate Army/$1\[\[Confederate States Army\]\]/; $content =~ s/([^[|:])Constitution of the United States/$1\[\[United States Constitution|Constitution of the United States\]\]/; $content =~ s/([^[|:])Democratic National Committee/$1\[\[Democratic National Committee\]\]/; $content =~ s/([^[|:])Democratic-Republican Party/$1\[\[Democratic-Republican Party (United States)|Democratic-Republican Party\]\]/; $content =~ s/([^[|:])Democratic Republican Party/$1\[\[Democratic-Republican Party (United States)|Democratic Republican Party\]\]/; $content =~ s/Department of Defense([^]|])/\[\[United States Department of Defense|Department of Defense\]\]$1/; $content =~ s/Department of War([^]|])/\[\[United States Department of War|Department of War\]\]$1/; $content =~ s/([^[|:])Eton College/$1\[\[Eton College\]\]/; $content =~ s/([^[|:])Free-Soil Party/$1\[\[Free Soil Party|Free-Soil Party\]\]/; $content =~ s/([^[|:])Harvard College/$1\[\[Harvard College\]\]/; $content =~ s/([^[|:])Republican National Committee/$1\[\[Republican National Committee\]\]/; $content =~ s/([^[|:])Union Army/$1\[\[Union Army\]\]/; $content =~ s/([^[|:])Union College/$1\[\[Union College\]\]/; $content =~ s/([^[|:])United States Army Medical Corps/$1\[\[Army Medical Department (United States)|United States Army Medical Corps\]\]/; $content =~ s/([^[|:])United States Army Reserve/$1\[\[United States Army Reserve\]\]/; $content =~ s/([^[|:])United States Treasury Department/$1\[\[United States Treasury Department\]\]/; $content =~ s/([^[|:])Washington, D\.C\./$1\[\[Washington, D.C.\]\]/; $content =~ s/([^[|:])William and Mary College/$1\[\[William and Mary College\]\]/; $content =~ s/([^[|:])Yale College/$1\[\[Yale College\]\]/; $content =~ s/as a Democrat/as a \[\[Democratic Party (United States)|Democrat\]\]/; $content =~ s/as a Federalist/as a \[\[Federalist Party (United States)|Federalist\]\]/; $content =~ s/as a Republican/as a \[\[Republican Party (United States)|Republican\]\]/; $content =~ s/as a Whig/as a \[\[Whig Party (United States)|Whig\]\]/; # grammar-related replacements $content =~ s/graduated, /graduated from /g; $content =~ s/lawyer, private/lawyer in private/g; $content =~ s/, (\d\d\d\d) ?- ?(\d\d\d\d)/ from $1 to $2/g; $content =~ s/(member|chairman|chair), /$1 of the /g; $content =~ s/\&\#146\;/'/g; $content =~ s/\&\#14[78]\;/"/g; return $content; } 1;