#!/usr/bin/php -q
$THESCRIPT = "wiki.phtml" ;
#include ( "./specialPages.php" ) ;
include ( "./databaseFunctions.php" ) ;
include ( "./basicFunctions.php" ) ;
include ( "./wikiTitle.php" ) ;
include ( "./wikiPage.php" ) ;
include ( "./wikiUser.php" ) ;
include ( "./wikiSettings.php" ) ;
include ( "./wikiLocalSettings.php" ) ;
## Language-dependent variables.
## Default/English values; the per-language sections further down override some of them.
# Name of the talk subpage; also used (lower-cased) as the talk namespace prefix in fixLinks().
$wikiTalk = "Talk";
# Byte that introduces a field separator in the old database files (see $FS1..$FS3).
$fieldSeparator = "\xb3" ;
# Label of the automatic back-link that fixLinks() appends to subpages.
$wikiSeeAlso = "See also" ;
# Written by storeInDB() as cur_user_text of the converted current revision.
$wikiConversionScript = "conversion script" ;
# Written by storeInDB() as cur_comment of the converted current revision.
$wikiAutomatedConversion = "Automated conversion" ;
# Identity recoder: hands the text back exactly as received.
# Selected while a wiki keeps its data in ISO-8859-1 (currently the English default).
#   $text  string to "recode"
#   returns the same string, untouched
function recodeCharsetStub ( $text )
{
	$unchanged = $text ;
	return $unchanged ;
}
# ISO-8859-1 -> UTF-8 recoder.
# The conversion itself is delegated to encode_utf8(), which is not defined in
# this file (presumably supplied by one of the included files - verify).
#   $text  ISO-8859-1 encoded string
#   returns whatever encode_utf8() produces for $text
function recodeCharsetLatin1 ( $text )
{
	$converted = encode_utf8 ( $text ) ;
	return $converted ;
}
# Default recoder. $recodeCharset holds a function NAME and is later invoked as a
# variable function ( $recodeCharset ( $text ) ). The name must be a quoted string:
# a bare word is an undefined constant, which only worked through PHP's old
# "assume string" fallback and is a fatal Error on PHP 8.
$recodeCharset = "recodeCharsetStub" ;
#$recodeCharset = "recodeCharsetLatin1" ;
## Esperanto:
if ( $wikiLanguage =="eo" ) {
	$wikiTalk = "Priparolu" ;
	$wikiSeeAlso = "Legu anka\xc5\xad" ;
	$wikiConversionScript = "konvertilo" ;
	$wikiAutomatedConversion = "A\xc5\xadtomata konvertado" ;

	# Replaces the two-character "x" digraphs (Cx, gx, SX, uX, ...) with the
	# UTF-8 byte sequences listed in $u. Both arrays are index-aligned; every
	# upper-case first letter maps to the upper-case accented letter no matter
	# how the "x" is cased, and likewise for lower-case.
	#   $text  string containing x-digraphs
	#   returns the string with all listed digraphs replaced
	function recodeCharsetEo ( $text ) {
		$x = array(
			"Cx", "cx", "CX", "cX",
			"Gx", "gx", "GX", "gX",
			"Hx", "hx", "HX", "hX",
			"Jx", "jx", "JX", "jX",
			"Sx", "sx", "SX", "sX",
			"Ux", "ux", "UX", "uX");
		$u = array(
			"\xc4\x88", "\xc4\x89", "\xc4\x88", "\xc4\x89",
			"\xc4\x9c", "\xc4\x9d", "\xc4\x9c", "\xc4\x9d",
			"\xc4\xa4", "\xc4\xa5", "\xc4\xa4", "\xc4\xa5",
			"\xc4\xb4", "\xc4\xb5", "\xc4\xb4", "\xc4\xb5",
			"\xc5\x9c", "\xc5\x9d", "\xc5\x9c", "\xc5\x9d",
			"\xc5\xac", "\xc5\xad", "\xc5\xac", "\xc5\xad");
		return str_replace ( $x , $u , $text ) ;
	}

	# Function name as a quoted string: a bare word is an undefined constant and
	# is a fatal Error on PHP 8.
	$recodeCharset = "recodeCharsetEo" ;
}
## French
if ( $wikiLanguage == "fr" ) {
	$wikiTalk = "Discuter" ;
	# Function name as a quoted string: a bare word is an undefined constant and
	# is a fatal Error on PHP 8.
	$recodeCharset = "recodeCharsetLatin1" ;
}
## Polish:
if ( $wikiLanguage == "pl" ) {
	$wikiTalk = "Dyskusja" ;
	$fieldSeparator = "\xff";

	# Converts ISO-8859-2 text to UTF-8 with a byte-for-byte lookup table
	# covering 0xA0..0xFF. Bytes outside that range are passed through as-is.
	#   $text  ISO-8859-2 encoded string
	#   returns the UTF-8 encoded string
	function RecodeCharsetPl ( $text ) {
		# Convert iso8859-2 to UTF-8
		# In a happy world, we could use iconv for this
		$l2u8 = array ( "\xA0" => "\xc2\xa0", "\xA1" => "\xc4\x84", "\xA2" => "\xcb\x98", "\xA3" => "\xc5\x81", "\xA4" => "\xc2\xa4", "\xA5" => "\xc4\xbd",
			"\xA6" => "\xc5\x9a", "\xA7" => "\xc2\xa7", "\xA8" => "\xc2\xa8", "\xA9" => "\xc5\xa0", "\xAA" => "\xc5\x9e", "\xAB" => "\xc5\xa4",
			"\xAC" => "\xc5\xb9", "\xAD" => "\xc2\xad", "\xAE" => "\xc5\xbd", "\xAF" => "\xc5\xbb", "\xB0" => "\xc2\xb0", "\xB1" => "\xc4\x85",
			"\xB2" => "\xcb\x9b", "\xB3" => "\xc5\x82", "\xB4" => "\xc2\xb4", "\xB5" => "\xc4\xbe", "\xB6" => "\xc5\x9b", "\xB7" => "\xcb\x87",
			"\xB8" => "\xc2\xb8", "\xB9" => "\xc5\xa1", "\xBA" => "\xc5\x9f", "\xBB" => "\xc5\xa5", "\xBC" => "\xc5\xba", "\xBD" => "\xcb\x9d",
			"\xBE" => "\xc5\xbe", "\xBF" => "\xc5\xbc", "\xC0" => "\xc5\x94", "\xC1" => "\xc3\x81", "\xC2" => "\xc3\x82", "\xC3" => "\xc4\x82",
			"\xC4" => "\xc3\x84", "\xC5" => "\xc4\xb9", "\xC6" => "\xc4\x86", "\xC7" => "\xc3\x87", "\xC8" => "\xc4\x8c", "\xC9" => "\xc3\x89",
			"\xCA" => "\xc4\x98", "\xCB" => "\xc3\x8b", "\xCC" => "\xc4\x9a", "\xCD" => "\xc3\x8d", "\xCE" => "\xc3\x8e", "\xCF" => "\xc4\x8e",
			"\xD0" => "\xc4\x90", "\xD1" => "\xc5\x83", "\xD2" => "\xc5\x87", "\xD3" => "\xc3\x93", "\xD4" => "\xc3\x94", "\xD5" => "\xc5\x90",
			"\xD6" => "\xc3\x96", "\xD7" => "\xc3\x97", "\xD8" => "\xc5\x98", "\xD9" => "\xc5\xae", "\xDA" => "\xc3\x9a", "\xDB" => "\xc5\xb0",
			"\xDC" => "\xc3\x9c", "\xDD" => "\xc3\x9d", "\xDE" => "\xc5\xa2", "\xDF" => "\xc3\x9f", "\xE0" => "\xc5\x95", "\xE1" => "\xc3\xa1",
			"\xE2" => "\xc3\xa2", "\xE3" => "\xc4\x83", "\xE4" => "\xc3\xa4", "\xE5" => "\xc4\xba", "\xE6" => "\xc4\x87", "\xE7" => "\xc3\xa7",
			"\xE8" => "\xc4\x8d", "\xE9" => "\xc3\xa9", "\xEA" => "\xc4\x99", "\xEB" => "\xc3\xab", "\xEC" => "\xc4\x9b", "\xED" => "\xc3\xad",
			"\xEE" => "\xc3\xae", "\xEF" => "\xc4\x8f", "\xF0" => "\xc4\x91", "\xF1" => "\xc5\x84", "\xF2" => "\xc5\x88", "\xF3" => "\xc3\xb3",
			"\xF4" => "\xc3\xb4", "\xF5" => "\xc5\x91", "\xF6" => "\xc3\xb6", "\xF7" => "\xc3\xb7", "\xF8" => "\xc5\x99", "\xF9" => "\xc5\xaf",
			"\xFA" => "\xc3\xba", "\xFB" => "\xc5\xb1", "\xFC" => "\xc3\xbc", "\xFD" => "\xc3\xbd", "\xFE" => "\xc5\xa3", "\xFF" => "\xcb\x99" ) ;
		return strtr ( $text , $l2u8 ) ;
	}

	# Function name as a quoted string: a bare word is an undefined constant and
	# is a fatal Error on PHP 8. (PHP function names are case-insensitive, so
	# this resolves to RecodeCharsetPl above.)
	$recodeCharset = "recodeCharsetPl" ;
}
## Spanish
if ( $wikiLanguage == "es" ) {
	# NOTE(review): at this point $recodeCharset is still whatever was selected
	# above, so the literal is recoded with the previous recoder, not with the
	# Latin-1 one chosen on the next statement. Whether that is intended depends
	# on the byte encoding of this source file - confirm before reordering.
	$wikiTalk = $recodeCharset ( "Discusión" ) ;
	# Function name as a quoted string: a bare word is an undefined constant and
	# is a fatal Error on PHP 8.
	$recodeCharset = "recodeCharsetLatin1" ;
}
## Where to get the old usemod database files from.
## The value must end in ".../page/" with a trailing slash: getFileName() appends
## the bucket directory directly to it, and getHistory() rewrites "/page/" to
## "/keep/" to locate the kept revisions.
#$rootDir = "/home/groups/w/wi/wikipedia/htdocs/fpw/wiki-de/lib-http/db/wiki/page/" ;
#$rootDir = "/home/manske/wiki/lib-http/db/wiki/page/" ;
$rootDir = "/home/wiki-eo/lib-http/db/wiki/page/" ;
#$rootDir = "/tmp/home/wiki-pl/wiki/db/page/" ;
#$rootDir = "/tmp/home/wiki-eo/lib-http/db/wiki/page/" ;
# Intended as a counter for unique page-history identifiers. In this file it is
# only declared global in getHistory() and storeInDB(); nothing reads or
# increments it.
$oldid = 100; # Need unique identifiers for page histories; increment each time we use it.
# Three levels of field separator, built from the (possibly language-specific)
# $fieldSeparator byte followed by a digit:
#   $FS1 splits the page-level fields (scanText2) and the kept revisions (getHistory),
#   $FS2 splits the fields of one section,
#   $FS3 splits the fields of the text record stored in a section's "data".
$FS = $fieldSeparator ;
$FS1 = $FS."1" ;
$FS2 = $FS."2" ;
$FS3 = $FS."3" ;
# Reads one old-format page file and extracts the current revision.
#
# The file content is split with $FS1 into page-level fields; the
# "text_default" field is split with $FS2 into section fields; the section's
# "data" field is split with $FS3 into the text record.
#
#   $fn  path of the .db file to read
#   returns array with the keys "text", "summary", "minor" (from the text
#           record) and "ts", "username", "host" (from the section); a key
#           that is absent from the file is returned as null.
#
# If the file cannot be read the SAME array shape is returned, with "text" set
# to an error message and everything else null. (This used to return a bare
# string, which the caller then indexed like an array.)
function scanText2 ( $fn ) {
	global $FS1 , $FS2 , $FS3 ;

	$textKeys = array ( "text" , "summary" , "minor" ) ;
	$sectionKeys = array ( "ts" , "username" , "host" ) ;

	# is_readable() first, so a missing file does not raise a warning.
	$lines = is_readable ( $fn ) ? file ( $fn ) : false ;
	if ( $lines === false ) {
		$ret = array () ;
		foreach ( $textKeys as $k ) $ret[$k] = null ;
		foreach ( $sectionKeys as $k ) $ret[$k] = null ;
		$ret["text"] = "There was an error converting this file : file not found." ;
		return $ret ;
	}

	# file() keeps the line terminators, so gluing with "" restores the content.
	$t = implode ( "" , $lines ) ;

	$page = splitHash ( $FS1 , $t ) ;
	$section = splitHash ( $FS2 , isset ( $page["text_default"] ) ? $page["text_default"] : "" ) ;
	$text = splitHash ( $FS3 , isset ( $section["data"] ) ? $section["data"] : "" ) ;

	# Key order matches the historical return value.
	$ret = array () ;
	foreach ( $textKeys as $k ) $ret[$k] = isset ( $text[$k] ) ? $text[$k] : null ;
	foreach ( $sectionKeys as $k ) $ret[$k] = isset ( $section[$k] ) ? $section[$k] : null ;
	return $ret ;
}
# Maps an article name to the path of its .db file below $rootDir.
# Files live in one sub-directory per upper-cased first character "A".."Z";
# every other first character goes to the "other" directory.
#   $an  article name (may contain "/" for subpages)
#   returns $rootDir . bucket . "/" . ucfirst ( $an ) . ".db"
function getFileName ( $an ) {
	global $rootDir ;

	$bucket = ucfirst ( substr ( $an , 0 , 1 ) ) ;
	$isLetter = ( $bucket >= "A" and $bucket <= "Z" ) ;
	if ( !$isLetter ) {
		$bucket = "other" ;
	}

	return $rootDir . $bucket . "/" . ucfirst ( $an ) . ".db" ;
}
# Rewrites the [[...]] links of one chunk of article text for the new wiki and
# records which link targets exist.
#
# Reads the globals $npage (->secureTitle, ->title of the page being
# converted), $allTopics (existing titles, bucketed by first letter / "0"),
# $wikiTalk and $wikiSeeAlso; appends to the globals $ll (targets found in
# $allTopics) and $ull (targets not found).
#
# What it does, in order:
#  - free-standing "/Subpage" words become [[/Subpage|/Subpage]];
#  - links starting with "/" are prefixed with the first path component of the
#    current page title, keeping the "/..." form as label when none was given;
#  - "HomePage" targets are renamed to "Main_Page";
#  - "X/<talk>" targets become "<talk>:X" (talk name lower-cased);
#  - on a talk page, all other links get a leading ":" and keep their original
#    target as label;
#  - on a non-talk subpage a "see also" back-link to the parent page is
#    appended, unless the text already links to the parent.
#
#   $s  text chunk (already escaped by the caller)
#   returns the rewritten chunk
function fixLinks ( $s ) {
	global $npage , $ll , $ull , $allTopics ;
	global $wikiTalk , $wikiSeeAlso ;

	$talk = explode ( "/" , $npage->secureTitle ) ;
	if ( count($talk)==2 and strtolower($talk[1])==strtolower($wikiTalk) ) $isTalkPage = true ;
	else $isTalkPage = false ;

	# Automatic backlink from a subpage to a "main" page
	$backLink = "" ;
	if ( $isTalkPage == false AND count ( $talk ) == 2 ) {
		$backLink = ucfirst ( $talk[0] ) ;
		$backLink = str_replace ( "_" , " " , $backLink ) ;
	}

	# Automatic subpages, one last time...
	# (preg_replace instead of ereg_replace: the POSIX regex functions were
	# removed in PHP 7; the pattern means the same in both dialects.)
	$s = preg_replace ( "#([\n ])/([A-Z][a-zA-Z0-9]+)#" , "\\1[[/\\2|/\\2]]" , $s ) ;

	# The leading blank guarantees a non-empty first piece; it is cut off again
	# by the substr() in the return statement.
	$s = " $s" ;
	$a = explode ( "[[" , $s ) ;
	$s = array_shift ( $a ) ;
	foreach ( $a as $x ) {
		$b = explode ( "]]" , $x , 2 ) ;
		$s .= "[[" ;
		if ( count ( $b ) == 1 ) $s .= $x ; # unterminated link: copy verbatim
		else {
			$c = explode ( "|" , $b[0] ) ; # $c[0] = target, $c[1] = label (if any)
			$link = $c[0] ;
			if ( substr ( $link , 0 , 1 ) == "/" ) { # Converting subpages
				$u = explode ( "/" , $npage->title ) ;
				if ( count ( $c ) == 1 ) array_push ( $c , substr ( $link , 1 ) ) ;
				$link = $u[0].$link ;
			}
			if ( ucfirst ( str_replace ( "_" , " " , $link ) ) == $backLink ) $backLink = "" ; # No backlink necessary

			# Book-keeping of linked / unlinked targets, using the target as it
			# is at this point (before the HomePage / talk rewriting below).
			$n = str_replace ( " " , "_" , $link ) ;
			$n = ucfirst ( $n ) ;
			$m = substr ( $n , 0 , 1 ) ;
			if ( $m < "A" or $m > "Z" ) $m = "0" ;
			# A missing bucket counts as "topic does not exist"; strict
			# comparison keeps numeric-looking titles ("007" / "7") apart.
			if ( isset ( $allTopics[$m] ) and in_array ( $n , $allTopics[$m] , true ) ) array_push ( $ll , $n ) ;
			else array_push ( $ull , $n ) ;

			# Re-linking /Talk pages to talk:
			$talk = explode ( "/" , $link ) ;
			if ( $talk[0] == "HomePage" ) {
				$talk[0] = "Main_Page" ;
				$link = $talk[0] ;
				if ( count ( $talk ) == 2 ) $link .= "/".$talk[1] ;
			}
			if ( count ( $talk ) == 2 and strtolower($talk[1]) == strtolower($wikiTalk) ) $link = strtolower($wikiTalk).":".$talk[0] ;
			else if ( $isTalkPage ) {
				if ( count ( $c ) == 1 ) array_push ( $c , $link ) ;
				$link = ":".$link ;
			}

			$s .= $link ;
			if ( count ( $c ) == 2 ) $s .= "|".$c[1] ;
			$s .= "]]".$b[1] ;
		}
	}

	if ( $backLink != "" ) $backLink = "\n:''$wikiSeeAlso :'' [[$backLink]]" ;
	return substr ( $s , 1 ).$backLink ;
}
# Prepares raw article text for the SQL dump: recodes the character set,
# escapes backslashes and quotes, and runs fixLinks() over everything that is
# NOT enclosed in the protected-section markers.
#
# NOTE(review): in the previous revision both marker literals were empty
# strings, so the split could never work (an empty pattern is rejected).
# The code shape - split on an opening marker, then split each remainder once
# on a closing marker and re-emit both - is that of protecting "nowiki"
# sections, so the markers are restored as such here. Confirm against the
# upstream script.
#
#   $s  raw article text
#   returns the escaped text with rewritten links
function convertText ( $s ) {
	global $recodeCharset ;

	$open = "<nowiki>" ;
	$close = "</nowiki>" ;

	$s = strtr ( $recodeCharset ( $s ) ,
		array ( "\\" => "\\\\" , "\"" => "\\\"" , "'" => "\\'" ) ) ;

	# Case-insensitive split; preg_split replaces spliti(), which was removed
	# in PHP 7. Matched markers are re-emitted in canonical lower case.
	$a = preg_split ( "#" . preg_quote ( $open , "#" ) . "#i" , $s ) ;
	$s = fixLinks ( array_shift ( $a ) ) ;
	foreach ( $a as $x ) {
		$b = preg_split ( "#" . preg_quote ( $close , "#" ) . "#i" , $x , 2 ) ;
		# No closing marker: the rest of the text stays untouched.
		if ( count ( $b ) == 1 ) $s .= $open.$x ;
		else $s .= $open.$b[0].$close.fixLinks($b[1]);
	}
	return $s ;
}
# Splits "key SEP value SEP key SEP value ..." into an associative array.
# Pieces are consumed in pairs; a trailing piece without a partner is dropped.
# When a key occurs more than once the last value wins.
#   $sep  separator string
#   $str  string to split; null is treated like an empty string
#   returns array ( key => value , ... ), empty when there is no complete pair
function splitHash ( $sep , $str ) {
	# The cast keeps a null subject (e.g. a field missing from the file) from
	# reaching explode().
	$temp = explode ( $sep , (string) $str ) ;
	$ret = array () ;
	$n = count ( $temp ) ;
	# Key and value are read with explicit indexes instead of relying on the
	# evaluation order of "$ret[$temp[$i]] = $temp[++$i]".
	for ( $i = 0 ; $i + 1 < $n ; $i += 2 ) {
		$ret[$temp[$i]] = $temp[$i + 1] ;
	}
	return $ret ;
}
# Makes a string ready for embedding in a quoted SQL literal: first recodes it
# with the function named in $recodeCharset, then backslash-escapes
# backslashes, double quotes and single quotes.
#   $str  raw string
#   returns the recoded and escaped string
function makeSafe ( $str ) {
	global $recodeCharset ;

	$escapes = array (
		"\\" => "\\\\" ,
		"\"" => "\\\"" ,
		"'"  => "\\'" ) ;

	$recoded = $recodeCharset ( $str ) ;
	return strtr ( $recoded , $escapes ) ;
}
# Builds the INSERT statements for the kept (old) revisions of one page.
#
# The keep file is located by rewriting ".../page/X.db" (from getFileName())
# to ".../keep/X.kp". Its content is split with $FS1 into revisions; each
# revision is split with $FS2 into section fields and the section's "data"
# with $FS3 into the text record. Statements are chained: the first one uses
# old_old_version 0, every later one LAST_INSERT_ID().
#
#   $title  article name as used for the file lookup
#   $st     title to store in old_title (inserted as given - not escaped here)
#   returns array of SQL strings, or "" when there is no keep file
function getHistory ( $title , $st) {
	global $FS1 , $FS2 , $FS3 ;

	$f = preg_replace ( "/\/page\/(.*)\.db$/" , "/keep/\$1.kp" , getFileName ( $title ) ) ;
	if ( !file_exists ( $f ) ) return "" ;

	# file() keeps each line's terminator, so the lines must be glued with "".
	# Gluing with "\n" (as before) doubled every line break of the old text.
	$fc = file ( $f ) ;
	if ( $fc === false ) return "" ;
	$keptlist = explode ( $FS1 , implode ( "" , $fc ) ) ;
	array_shift ( $keptlist ) ; # drop whatever precedes the first separator

	$lastoldid = "0"; $sql = array () ;
	foreach ( $keptlist as $rev ) {
		$section = splitHash ( $FS2 , $rev ) ;
		$text = splitHash ( $FS3 , isset ( $section["data"] ) ? $section["data"] : "" ) ;

		$body = isset ( $text["text"] ) ? $text["text"] : "" ;
		$summary = isset ( $text["summary"] ) ? $text["summary"] : "" ;
		$minor = isset ( $text["minor"] ) ? $text["minor"] : "" ;
		# intval() instead of "*1": a non-numeric string in arithmetic is a
		# TypeError on PHP 8. It also guarantees that only integers are pasted
		# into the statement below.
		$ts = isset ( $section["ts"] ) ? intval ( $section["ts"] ) : 0 ;
		# Fall back to the host when no user name was recorded.
		$who = !empty ( $section["username"] ) ? $section["username"]
			: ( isset ( $section["host"] ) ? $section["host"] : "" ) ;
		$user = makeSafe ( $who ) ;

		if ( $body && $minor != "" && $ts > 0 ) {
			array_push ( $sql , "INSERT INTO old (old_title,old_text,old_comment,old_user,old_user_text,old_old_version,old_timestamp,old_minor_edit) "
				. "VALUES (\"$st\",\"" . makeSafe($body) . "\",\"" . makeSafe($summary) . "\","
				. "0,\"$user\",$lastoldid,FROM_UNIXTIME(" . $ts . ")," . intval ( $minor ) . ")" ) ;
			$lastoldid = "LAST_INSERT_ID()" ;
		} else {
			echo " (note: skipped a bad old revision)";
		}
	}
	return $sql ;
}
# Converts one page and emits the SQL for it.
#
# Two modes, selected by the global $historyOnly:
#  - dump mode (default): history INSERTs, the INSERT for the current revision
#    and the linked / unlinked rows are appended to the output file $of;
#  - history-only mode: the history INSERTs are executed on the live database
#    $connection and the existing rows are chained onto them.
# With the global $noHistory set, no history statements are generated at all.
#
# Side effects: resets the globals $ll / $ull (filled again by fixLinks() via
# convertText()) and replaces the global $npage.
#
#   $title  article name as found on disk
#   $text   revision array as returned by scanText2() ("text", "summary",
#           "minor", "ts", "username", "host"); a plain string is accepted
#           and treated as the article text
#   returns nothing
function storeInDB ( $title , $text ) {
	global $of , $npage , $ll , $ull , $wikiTalk , $recodeCharset , $wikiAutomatedConversion , $wikiConversionScript ;
	global $historyOnly , $noHistory , $connection ;

	# Tolerate a bare string and missing keys instead of indexing blindly.
	if ( !is_array ( $text ) ) $text = array ( "text" => $text ) ;
	$body = isset ( $text["text"] ) ? $text["text"] : "" ;
	$summary = isset ( $text["summary"] ) ? $text["summary"] : "" ;
	$minor = isset ( $text["minor"] ) ? $text["minor"] : "" ;
	$tsRaw = isset ( $text["ts"] ) ? $text["ts"] : "" ;
	# intval() instead of "*1": non-numeric strings in arithmetic are a
	# TypeError on PHP 8.
	$ts = intval ( $tsRaw ) ;
	$who = !empty ( $text["username"] ) ? $text["username"]
		: ( isset ( $text["host"] ) ? $text["host"] : "" ) ;

	$ll = array () ;
	$ull = array () ;
	$title = str_replace ( "\\'" , "'" , $title ) ;
	$title = str_replace ( "\\\"" , "\"" , $title ) ;
	$npage = new wikiPage ;
	$npage->title = $recodeCharset ( $title ) ;
	$npage->makeAll () ;
	$thetext = convertText ( $body ) ;
	$st = $npage->secureTitle ;

	# Move talk pages to talk namespace
	$talk = explode ( "/" , $st ) ;
	if ( $talk[0] == "HomePage" ) { $talk[0] = "Main_Page" ; $st = $talk[0] ; }
	if ( count ( $talk ) == 2 and $talk[1] == $wikiTalk ) $st = $wikiTalk.":".$talk[0] ;
	# Subpages named with the lower-cased talk word are not converted at all.
	if ( count ( $talk ) == 2 and $talk[1] == strtolower($wikiTalk) ) return ;

	$lastoldid = "0"; $sql = array () ;
	if ( ! $noHistory ) {
		$sql = getHistory ( $title , $st ) ;
		if ( ! $sql ) $sql = array () ;
		if (count($sql)) $lastoldid = "LAST_INSERT_ID()" ;

		# Insert untouched version as the last in the history chain
		$user = makeSafe ( $who ) ;
		if ( $body && $minor != "" && $ts > 0 ) {
			array_push ( $sql , "INSERT INTO old (old_title,old_text,old_comment,old_user,old_user_text,old_old_version,old_timestamp,old_minor_edit) "
				. "VALUES (\"$st\",\"" . makeSafe($body) . "\",\"" . makeSafe($summary) . "\","
				. "0,\"$user\",$lastoldid,FROM_UNIXTIME(" . $ts . ")," . intval ( $minor ) . ")" );
			$lastoldid = "LAST_INSERT_ID()" ;
		} else {
			echo " (skipping last old revision - $tsRaw $minor $body)";
		}
	}

	if ( $historyOnly ) {
		# Nothing was generated for this page: LAST_INSERT_ID() would still hold
		# the id of a revision inserted for a PREVIOUS page, and the UPDATE below
		# would chain this page onto foreign history.
		if ( count ( $sql ) == 0 ) return ;

		$sqlx = "SELECT cur_old_version FROM cur WHERE cur_title=\"$st\"" ;
		$result = mysql_query ( $sqlx , $connection ) ;
		echo mysql_error () ;
		if ( $s = mysql_fetch_object ( $result ) ) {
			$oid = $s->cur_old_version ;
			mysql_free_result ( $result ) ;
		} else $oid = 0 ;

		# The "x_timestamp=x_timestamp" assignments are kept as they were;
		# presumably they stop the timestamp column from auto-updating.
		# NOTE(review): the first UPDATE matches every old row of this title
		# with old_old_version=0, which includes the first revision imported
		# just above - verify that this is intended.
		if ( $oid )
			array_push ( $sql , "UPDATE old SET old_old_version=LAST_INSERT_ID(),old_timestamp=old_timestamp WHERE old_old_version=0 AND old_title=\"$st\"" ) ;
		else
			array_push ( $sql , "UPDATE cur SET cur_old_version=LAST_INSERT_ID(),cur_timestamp=cur_timestamp WHERE cur_title=\"$st\"" ) ;

		foreach ( $sql as $line ) {
			mysql_query ( $line , $connection ) ;
			echo mysql_error ();
		}
	} else {
		$sql = implode ( ";\n" , $sql ) ;
		$sql .= ";\nINSERT INTO cur (cur_title,cur_ind_title,cur_text,cur_comment,cur_user,cur_user_text,cur_old_version,cur_minor_edit) VALUES ";
		$sql .= "(\"$st\",\"$st\",\"$thetext\",";
		$sql .= "\"$wikiAutomatedConversion\",0,\"$wikiConversionScript\",$lastoldid,1);\n" ;
		foreach ( $ll as $l ) {
			$sql .= "INSERT INTO linked (linked_from,linked_to) VALUES (\"$st\",\"$l\");\n";
		}
		foreach ( $ull as $l ) {
			$sql .= "INSERT INTO unlinked (unlinked_from,unlinked_to) VALUES (\"$st\",\"$l\");\n";
		}
		fwrite ( $of , $sql ) ;
	}
}
# Recursively collects the article names found below a directory.
# Every "*.db" file contributes its name without the extension; names from
# sub-directories are prefixed with "<subdir>/". Any other file only triggers
# a printed warning.
#   $dir  directory to scan
#   returns list of article names; empty when $dir is missing or unreadable
function getTopics ( $dir ) {
	$ret = array () ;

	# Not every letter bucket has to exist.
	if ( !is_dir ( $dir ) ) return $ret ;
	$mydir = opendir ( $dir ) ;
	if ( !$mydir ) return $ret ;

	# Compare against false explicitly: an entry called "0" is falsy and would
	# otherwise end the scan early.
	while ( false !== ( $entry = readdir ( $mydir ) ) ) {
		if ( $entry == '.' || $entry == '..' ) continue ;

		if ( is_dir ( "$dir/$entry" ) ) {
			$a = getTopics ( "$dir/$entry" ) ;
			foreach ( $a as $x ) array_push ( $ret , "$entry/$x" ) ;
		} else {
			if (substr ($entry, strlen ( $entry ) - 3, 3) == '.db') {
				$x = substr ( $entry , 0 , strlen ( $entry ) - 3 ) ;
				array_push ( $ret , $x ) ;
			} else {
				print "Warning: File \"$entry\" doesn't seem to contain an article
\n" ;
			}
		}
	}
	closedir($mydir);
	return $ret ;
}
# Converts every article of one bucket directory.
# Lists the articles below "$rootDir/$letter", prints a progress line for each
# of them and passes the parsed file to storeInDB(). Articles whose name
# starts with "a".."z" are reported and skipped.
#   $letter  bucket directory name ("A".."Z" or "other")
#   returns nothing
function dir2DB ( $letter ) {
	global $rootDir ;

	$articles = getTopics ( "$rootDir/$letter" ) ;
	print "Reading :\n" ;

	foreach ( $articles as $name ) {
		$first = substr ( $name , 0 , 1 ) ;
		if ( $first >= "a" and $first <= "z" ) {
			print "IGNORING LOWERCASE FIRST FILE : $name\n" ;
			continue ;
		}

		print "$name" ;
		$path = getFileName ( $name ) ;
		$revision = scanText2 ( $path ) ;
		storeInDB ( $name , $revision ) ;
		print "\n" ;
	}

	print "\n" ;
}
# Fills the global $allTopics with every article name on disk, recoded with
# the function named in $recodeCharset.
# Buckets "A".."Z" are read from the directories of the same name below
# $rootDir; bucket "0" is read from the "other" directory.
#   returns nothing (result is left in $allTopics)
function getAllTopics () {
	global $allTopics , $rootDir , $wikiTalk;
	global $recodeCharset ;

	# bucket key => directory name, in the order they are scanned
	$buckets = array () ;
	for ( $code = 65 ; chr ( $code ) <= "Z" ; $code++ ) {
		$buckets[chr ( $code )] = chr ( $code ) ;
	}
	$buckets["0"] = "other" ;

	$allTopics = array () ;
	foreach ( $buckets as $key => $subdir ) {
		$names = getTopics ( "$rootDir/" . $subdir ) ;
		$recoded = array () ;
		foreach ( $names as $name ) {
			array_push ( $recoded , $recodeCharset ( $name ) ) ;
		}
		$allTopics[$key] = $recoded ;
	}
}
# MAIN PROGRAM
# Command line switches:
#   --history    import only the old revisions, into the live database
#   --nohistory  skip the old revisions (testing aid)
# Without --history everything is written to ./newiki.sql, which starts with a
# copy of ./wikipedia.sql followed by two DELETE statements for the cur table.
global $rootDir , $historyOnly , $connection ;
$historyOnly = in_array ( "--history" , $argv ) ; # To import only old histories into database
$noHistory = in_array ( "--nohistory" , $argv ) ; # For testing
set_time_limit ( 0 ) ; # Enough time for this script...
global $ll , $ull , $allTopics ;
$ll = array () ;
$ull = array () ;
getAllTopics () ;
global $l , $of ;
if ( ! $historyOnly ) {
	# Stop right away when the dump cannot be written, instead of converting
	# everything into a dead file handle.
	$of = fopen ( "./newiki.sql" , "w" )
		or die("Could not open ./newiki.sql for writing.") ;
	$nw = file ( "./wikipedia.sql" ) ;
	foreach ( $nw as $line ) fwrite ( $of , $line ) ;
	fwrite ( $of , "DELETE FROM cur WHERE cur_title NOT LIKE \"%:%\";\n" ) ;
	fwrite ( $of , "DELETE FROM cur WHERE cur_title LIKE \"$wikiTalk:%\";\n" ) ;
} else {
	# We need a live database to check against to import the histories
	# NOTE(review): the mysql_* extension no longer exists in PHP 7+; this
	# branch and the history-only branch of storeInDB() have to be ported
	# together.
	//establish user connection
	$connection = mysql_pconnect($wikiThisDBserver , $wikiThisDBuser , $wikiThisDBpassword )
		or die("Could not get connection to database server.") ;
	//open up database
	# $wikiSQLServer is passed as the database NAME here, despite what its
	# own name suggests.
	mysql_select_db ($wikiSQLServer , $connection)
		or die("Could not select database: $wikiSQLServer");
}

# Walk the buckets "A".."Z" and then "other". $l is the character code of the
# current bucket, or the string "other"; a value set before this point is
# honoured as the starting bucket.
if ( !isset ( $l ) ) $l = 65 ;
do {
	if ( $l === "other" ) $letter = "other" ;
	else $letter = chr ( $l ) ;

	dir2DB ( $letter ) ;

	# Advance only while there is a next bucket. (Adding 1 to the string
	# "other", as the old loop did on its last pass, is a TypeError on PHP 8
	# and aborted the run before the "other" bucket was converted.)
	if ( $letter == "Z" ) $l = "other" ;
	else if ( $letter != "other" ) $l = $l + 1 ;
} while ( $letter != "other" ) ;

if (isset ( $of )) fclose ( $of ) ;
print "FINISHED!\n" ;
?>