JargonWiki:Jargon2Wiki.php
From JargonWiki
Note: SVN access, most recent versions, documentation, version history, and more are available at the Google Code site for this project: http://code.google.com/p/jargonwiki/
<?php
# @title PHP Jargon File to MediaWiki Converter
# @version 0.2
# @author Charles File
# @link http://jargonwiki.com/
#
# This script is designed to accept input in the format used by the plaintext version of
# The Jargon File and output to either screen or local disk that same content in a format:
# a) Readable and Useful to MediaWiki
# b) Usable by BulkPageCreator.php, which actually submits the articles to the wiki.
#
# If you're interested, and/or want to try out this script, you can get your hands on a
# copy of a text version of the Jargon File at:
# http://catb.org/jargon/oldversions/
#
# As of this writing, the latest version is:
# http://catb.org/jargon/oldversions/jarg447.txt
#
# Syntax:
#
# Invoked via a Command Line Interface (CLI):
# > php Jargon2Wiki.php input [mode] [outputfile] [debug flag]
# for instance:
# > php Jargon2Wiki.php inputfile.txt file outputfile.txt 1
#
# - or -
#
# Invoked via a Uniform Resource Identifier (URI):
# http://../Jargon2Wiki.php?file=inputfile.txt(&mode=file&out=outputfile.txt&debug=1)
#
# If not passed in the argument, mode defaults to screen, output file defaults to output.txt,
# and debug defaults to off.
#
# Note: This parser is meant to be run in a controlled, local environment,
# to prepare and wiki-ify content before it is submitted to a MediaWiki.
# Deployment to open access is completely untested and unsupported.
# If you are looking to dynamically parse and submit content of unknown
# trustworthiness to your Wiki, I suggest you use something else.
#
# Known issues:
# This whole thing could probably be a bit more efficient by combining many of the string replacement
# functions. But it won't change much big-O wise, and since it's only a parser meant to be run rarely,
# I'm not very concerned.
# Lots of general clean-up necessary.
# Using globals is hack-y and potentially a security risk.
# Hell, this thing is full of potential security risks.
# Error handling and reporting is rudimentary, at best.
#
# Resolved Issues as of v0.2:
# See All: Still not parsing correctly.
# Some articles are of the form :word: 1. n. Noun definition. 2. v. Verb definition. Grrr....
# Ordered lists behave badly.
# Need to parse and wiki-ify image links.
#
# Tunable settings. Adjust these to match the input file and how chatty the run should be.
#
# Line terminator used by the input file; also used when splitting it and when writing output.
$strCRLF = "\r\n";
# Version of The Jargon File being converted; embedded in the generated templates.
$strJargonFileVersion = '4.4.7';
# Debug mode switch: 1 turns on the extra HTML debug output, 0 leaves it off.
$boolDebug = 0;
# Expand a Jargon File grammatical abbreviation (e.g. "n.") into the word it stands for.
#
# Surrounding whitespace and commas are ignored, so tokens lifted straight out of running
# text (such as " n. " or "vt.,") resolve the same way as the bare abbreviation.
#
# @param string strParamAbbreviation An abbreviation string, including its trailing period.
# @return string What the passed abbreviation string stands for, or "" if it is not recognised.
function getAbbreviationValue($strParamAbbreviation) {
    static $arrExpansions = array(
        "abbrev." => "Abbreviation",
        "adj."    => "Adjective",
        "adv."    => "Adverb",
        "alt."    => "Alternate",
        "cav."    => "Caveat",
        "conj."   => "Conjunction",
        "esp."    => "Especially",
        "excl."   => "Exclamation",
        "imp."    => "Imperative",
        "interj." => "Interjection",
        "n."      => "Noun",
        "obs."    => "Obsolete",
        "pl."     => "Plural",
        "poss."   => "Possibly",
        "pref."   => "Prefix",
        "prob."   => "Probably",
        "prov."   => "Proverbial",
        "quant."  => "Quantifier",
        "suff."   => "Suffix",
        "v."      => "Verb",
        "var."    => "Variant",
        "vi."     => "Intransitive Verb",
        "vt."     => "Transitive Verb"
    );

    # Strip the default trim() characters plus commas from both ends before the lookup.
    $strKey = trim((string) $strParamAbbreviation, " \t\n\r\0\x0B,");

    if (isset($arrExpansions[$strKey])) {
        return $arrExpansions[$strKey];
    } //if
    return "";
} //function
# Guess an article's origin from its derivation text.
#
# The derivation is tested against a fixed, ordered list of case-insensitive patterns and the
# label of the FIRST pattern that matches is returned; the order therefore acts as a priority.
# Matching is by plain substring, so e.g. "irc" also matches inside "circuit".
#
# Uses preg_match() rather than eregi(): the POSIX regex functions were removed in PHP 7.
#
# @param string strParamDerivation A string containing information on an article's origins.
# @return string The detected origin, or "" if none of the known origins is mentioned.
function getEtymology($strParamDerivation) {
    static $arrOrigins = array(
        '/unix/i'               => "Unix",
        '/irc/i'                => "IRC",
        '/(usenet|newsgroup)/i' => "Usenet",
        '/(email|@)/i'          => "Email",
        '/(http|www|web)/i'     => "World Wide Web",
        '/ascii/i'              => "ASCII",
        '/ansi/i'               => "ANSI",
        '/greek/i'              => "Greek",
        '/french/i'             => "French",
        '/german/i'             => "German",
        '/spanish/i'            => "Spanish",
        '/latin/i'              => "Latin",
        '/japanese/i'           => "Japanese",
        '/chinese/i'            => "Chinese",
        '/indian/i'             => "Indian",
        '/ftp/i'                => "FTP"
    );

    $strDerivation = (string) $strParamDerivation;
    foreach ($arrOrigins as $strPattern => $strOrigin) {
        if (preg_match($strPattern, $strDerivation)) {
            return $strOrigin;
        } //if
    } //foreach
    return '';
} //function
# Render an array as a single line of "[key] => value " pairs (each pair ends with a space).
#
# @param array arrParam A one-dimensional array.
# @return string The key/value listing on one line; "" when the argument is not an array or is empty.
function describeArray($arrParam) {
    # Anything that is not an array has nothing to describe.
    if (!is_array($arrParam)) {
        return '';
    } //if

    $arrPairs = array();
    foreach ($arrParam as $mixKey => $mixValue) {
        $arrPairs[] = "[" . $mixKey . "] => " . $mixValue . " ";
    } //foreach
    return implode('', $arrPairs);
} //function
# Convert one Jargon File entry into a MediaWiki article in the format "BulkPageCreator.php" reads.
#
# The entry is expected to start with a headword line of the form
#   title: /pronunciation/, abbrev. ...
# (leading line breaks and ":" characters are stripped first), followed by the body text.
# From that the function extracts the title, pronunciations, grammatical abbreviations,
# "See also" cross references, a bracketed derivation, and a bracketed/parenthesised
# "alt." note, wiki-formats the body, and assembles the final article text.
#
# Reads the globals $strCRLF (line terminator), $strJargonFileVersion and $boolDebug.
# When $boolDebug is truthy, an HTML debug dump is prepended to the result; that dump
# deliberately leaves two <div> elements open, which the caller is expected to close.
#
# @param string strParamArticle An article in Jargon File format to be parsed into MediaWiki format.
# @return string The argument string in MediaWiki format.
function parseArticle($strParamArticle) {
    # Get some globals we'll need.
    global $strCRLF;
    global $strJargonFileVersion;
    global $boolDebug;

    $strParamArticle = (string) $strParamArticle;
    $intCRLFLength = strlen($strCRLF);
    # The line terminator, escaped so it can be embedded safely in the "/"-delimited patterns below.
    $strCRLFPattern = preg_quote($strCRLF, "/");

    # Get rid of any line breaks or leading ":"'s at the start of the article, so that every
    # article starts directly with its title. The line terminator may be longer than one
    # character ("\r\n"), so it has to be compared over its full length.
    while (true) {
        if ($intCRLFLength > 0 && substr($strParamArticle, 0, $intCRLFLength) === $strCRLF) {
            $strParamArticle = (string) substr($strParamArticle, $intCRLFLength);
        } else if (substr($strParamArticle, 0, 1) === ":") {
            $strParamArticle = (string) substr($strParamArticle, 1);
        } else {
            break;
        } //if..else
    } //while

    # Find the positions of the characters that immediately follow the data we want.
    # A missing line break means the whole article is a single line; a ":" that is missing
    # (or only appears after the first line) means the whole first line is the title.
    $intIndexEndFirstLine = strpos($strParamArticle, $strCRLF);
    if ($intIndexEndFirstLine === false) {
        $intIndexEndFirstLine = strlen($strParamArticle);
    } //if
    $intIndexEndTitle = strpos($strParamArticle, ":");
    if ($intIndexEndTitle === false || $intIndexEndTitle > $intIndexEndFirstLine) {
        $intIndexEndTitle = $intIndexEndFirstLine;
    } //if

    # Split the article into Title, Info, and Body.
    # Title: from the start of the article up to (not including) the first ":".
    $strTitle = (string) substr($strParamArticle, 0, $intIndexEndTitle);
    # Info: everything between that ":" and the end of the first line, whitespace-normalised.
    $strInfo = '';
    if ($intIndexEndFirstLine > $intIndexEndTitle + 1) {
        $strInfo = (string) substr($strParamArticle, $intIndexEndTitle + 1, $intIndexEndFirstLine - $intIndexEndTitle - 1);
    } //if
    $strInfo = preg_replace('/\s+/', ' ', trim($strInfo));
    # Body: from the first line break to the end of the article.
    $strBody = (string) substr($strParamArticle, $intIndexEndFirstLine);

    # Split Info into words and sort them into pronunciations (contain "/") and
    # abbreviations (contain "."); anything else on the headword line is ignored.
    $arrInfo = explode(" ", $strInfo);
    $arrPronunciation = array();
    $arrAbbreviation = array();
    foreach ($arrInfo as $strValueInfo) {
        $strValueInfo = trim(str_replace(',', '', $strValueInfo));
        if (strpos($strValueInfo, "/") !== false) {
            $arrPronunciation[] = $strValueInfo;
        } else if (strpos($strValueInfo, ".") !== false) {
            $arrAbbreviation[] = $strValueInfo;
        } //if..else
    } //foreach
    # An empty pronunciation marker carries no information.
    if (count($arrPronunciation) > 0 && $arrPronunciation[0] == "//") {
        $arrPronunciation[0] = "";
    } //if

    # Some articles are of the form :word: 1. n. Noun definition. 2. v. Verb definition.
    # If the headword line gave us no abbreviations, look for them in the body instead.
    # Only the abbreviation itself (capture group 1) is kept, not the whitespace/comma around it.
    if (count($arrAbbreviation) == 0) {
        $arrAbbreviationCatcher = array();
        preg_match_all('/\s(abbrev|adj|adv|alt|cav|conj|esp|excl|imp|interj|n|obs|pl|poss|pref|prob|prov|quant|suff|v|var|vi|vt)\.(\s|\,)/', $strBody, $arrAbbreviationCatcher, PREG_PATTERN_ORDER);
        foreach ($arrAbbreviationCatcher[1] as $strValueAbbreviationCatcher) {
            $arrAbbreviation[] = $strValueAbbreviationCatcher . ".";
        } //foreach
    } //if

    # Find and wiki-ify image references.
    $strBody = preg_replace('/\[\s*(\S+\.((png)|(gif)|(jpg)|(jpeg)))\s*\]/', '[[Image:$1]]', $strBody);

    # Wiki format body text.
    # Remove two line breaks (and the indentation after them) at the start of the body.
    $strBody = preg_replace('/^' . $strCRLFPattern . $strCRLFPattern . '\s+/', "", $strBody);
    # Remove line breaks between lines within paragraphs,
    # but retain two line breaks between paragraphs.
    $strBody = preg_replace('/' . $strCRLFPattern . $strCRLFPattern . '/', "--PARAGRAPH BREAK--", $strBody);
    $strBody = preg_replace('/\s+/', " ", $strBody);
    $strBody = str_replace("--PARAGRAPH BREAK--", $strCRLF . $strCRLF, $strBody);
    # Remove white space at the start of paragraphs.
    $strBody = str_replace($strCRLF . " ", $strCRLF, $strBody);

    # Find See Also: entries in the page.
    # First, find phrases that contain:
    # See (Also [optional])(: [optional]) {[some word]} ({[optional additional word]}) ({[ditto]}) (...)
    preg_match_all('/(((S|s)ee)|((C|c)ompare)|((O|o)ppose)|((A|a)lso))(\s+(A|a)lso)?:?\s+\{[^\}]+\}([^\.\)]*\{[^\}]+\})*(\.|\))/', $strBody, $arrMatchPhraseSeeAlso, PREG_SET_ORDER);
    $arrSeeAlso = array();
    # Now pull out just the substrings between "{" and "}" in each phrase; these are the article
    # names to link to. Format them as wiki links and collect them.
    foreach ($arrMatchPhraseSeeAlso as $arrValuePhraseSeeAlso) {
        preg_match_all('/\{[^\}]+\}/', $arrValuePhraseSeeAlso[0], $arrMatchSeeAlso, PREG_PATTERN_ORDER);
        foreach ($arrMatchSeeAlso[0] as $strValueMatchSeeAlso) {
            $arrSeeAlso[] = str_replace(array("{", "}"), array("[[", "]]"), $strValueMatchSeeAlso);
        } //foreach
    } //foreach
    # Dedupe!
    $arrSeeAlso = array_unique($arrSeeAlso);

    # Find Derivation information in the page: the first bracketed string in the body.
    # The text inside the brackets must not begin with "alt." / "alternate" / "alternative",
    # which is "Alternate" text (handled below), so the negative lookahead sits directly
    # after the opening bracket. The trailing (?!\]) skips the "[[Image:...]]" links created
    # above without swallowing the character that follows the closing bracket.
    $strDerivation = '';
    if (preg_match('/\[(?![Aa]lt(\.|ernat(iv)?e)?:?\s+)[^\]]{2,}\](?!\])/', $strBody, $arrMatchDerivation)) {
        $strDerivation = trim(str_replace(array("[", "]"), "", $arrMatchDerivation[0]));
        # Capitalize.
        $strDerivation = strtoupper(substr($strDerivation, 0, 1)) . substr($strDerivation, 1);
        # Add a full stop to the end (if necessary).
        if ($strDerivation != "" && substr($strDerivation, -1, 1) != ".") {
            $strDerivation .= ".";
        } //if
    } //if
    # Note: To include parentheses as a delimiting criterion in this silliness,
    # work the following lines in with the previous, where appropriate.
    // preg_match("/(\[|\()[^\]\)]+(\]|\))/", $strBody, $arrMatchDerivation);
    // $strDerivation = str_replace("(", "", str_replace(")", "", $strDerivation));

    # Get Etymology info from Derivation info.
    $strEtymology = getEtymology($strDerivation);

    # Find Alternate term or spelling information in the page.
    # Search the body for a string in parentheses or brackets that starts with "alt." etc.
    # The white space character required after the keyword is there so we avoid matching
    # Usenet group names (e.g. alt.arts.poetry.comments, etc.)
    $strAlternate = '';
    if (preg_match('/(\(|\[)(A|a)lt(\.|ernat(e|ive))?:?\s[^\)\]]+(\)|\])/', $strBody, $arrMatchAlternate)) {
        # Remove the "alt." keyword, the delimiters, and any line breaks.
        $strAlternate = preg_replace('/(A|a)lt(\.|ernat(e|ive))?:?\s/', "", $arrMatchAlternate[0]);
        $strAlternate = str_replace(array("(", ")", "[", "]"), "", $strAlternate);
        $strAlternate = preg_replace('/' . $strCRLFPattern . '/', "", $strAlternate);
    } //if

    # Wiki format ordered lists.
    # Looking for (a newline, space?, then a number, OR string start, space?, followed by a "1")
    # [need this or we match numbers at the ends of sentences]
    # then a period then some space.
    # ASSUMPTION: All ordered lists start with "1".
    $strBody = preg_replace(
        array(
            '/((' . $strCRLFPattern . '\s*1)|(^\s*1))\.\s+/',
            '/' . $strCRLFPattern . '\s*[0-9]+\.\s+/'
        ),
        array("# ", $strCRLF . "# "),
        $strBody
    );

    # Wiki format links
    $strBody = str_replace(array("{", "}"), array("[[", "]]"), $strBody);

    # Create link to the Jargon File [This isn't quite foolproof].
    # Articles live in a directory named after the upper-cased first letter of the title,
    # or "0" when the title starts with a digit or other non-letter.
    $strTitleFirstChar = substr($strTitle, 0, 1);
    if (preg_match('/[a-zA-Z]/', $strTitleFirstChar)) {
        $strJargonFileDir = strtoupper($strTitleFirstChar);
    } else {
        $strJargonFileDir = "0";
    } //if..else
    # File name: spaces and slashes become hyphens, "@" becomes "at" ...
    $strJargonFileLinkTitle = str_replace(array(" ", "/", "\\", "@"), array("-", "-", "-", "at"), $strTitle);
    # ... a bang at the end becomes a hyphen ...
    $strJargonFileLinkTitle = preg_replace('/\!$/', "-", $strJargonFileLinkTitle);
    # ... non-URL characters are removed ...
    $strJargonFileLinkTitle = preg_replace('/[^a-zA-Z0-9\?\&\_\-]/', "", $strJargonFileLinkTitle);
    # ... and non alpha-numeric characters are cut off the front.
    $strJargonFileLinkTitle = preg_replace('/^[^a-zA-Z0-9]+/', "", $strJargonFileLinkTitle);
    $strJargonFileLink = "http://catb.org/jargon/html/" . $strJargonFileDir . "/" . $strJargonFileLinkTitle . ".html";

    # Wiki links for the grammatical abbreviations; used both in the template and in the text.
    $arrUsageLinks = array();
    foreach ($arrAbbreviation as $strAbbrevValue) {
        $arrUsageLinks[] = "[[Grammar:" . getAbbreviationValue($strAbbrevValue) . "|" . $strAbbrevValue . "]]";
    } //foreach

    # Create Wiki article in the format usable by the batch upload script "BulkPageCreator.php".
    $strArticle = str_replace(":", "", $strTitle);
    $strArticle .= $strCRLF;
    # Add formatting information for "BulkPageCreator.php".
    $strArticle .= "--ENDTITLE--";
    $strArticle .= $strCRLF;
    # Add the "This comes from The Jargon File" template.
    $strArticle .= "{{Jargon File|" . $strJargonFileVersion . "}}";
    $strArticle .= $strCRLF;
    # Add the "Jargon Information Summary" template.
    $strArticle .= "{{Jargon|Term=" . $strTitle . "|";
    # Add pronunciation information to the template call in the format:
    # Pronunciation=/foo/|Pronunciation-2=/fu/|Pronunciation-3=/fuh/
    $intCount = 1;
    foreach ($arrPronunciation as $strPronounceValue) {
        $strArticle .= "Pronunciation";
        if ($intCount > 1) {
            $strArticle .= "-" . $intCount;
        } //if
        $strArticle .= "=" . $strPronounceValue . "|";
        $intCount++;
    } //foreach
    # Add usage information to the template call in the format:
    # Usage=[[Grammar:Noun|n.]] [[Grammar:Obsolete|obs.]] ... [[Grammar:Adjective|adj.]]
    $strArticle .= "Usage=" . implode(" ", $arrUsageLinks) . "|";
    # Add etymology information to the template call in the format:
    # Etymology=[[Etymology:Unix|Unix]]
    $strArticle .= "Etymology=";
    if ($strEtymology != "") {
        $strArticle .= "[[Etymology:" . $strEtymology . "|" . $strEtymology . "]]";
    } //if
    $strArticle .= "|";
    # Add derivation information to the template call in the format:
    # Derivation=This comes from Unix
    $strArticle .= "Derivation=" . str_replace(array("{", "}"), array("[[", "]]"), $strDerivation) . "|";
    # Add alternate term/spelling information to the template call in the format (for example, could be anything):
    # Alternate=Foobar /`fu bar/
    $strArticle .= "Alternate=" . str_replace(array("{", "}"), array("[[", "]]"), $strAlternate) . "|";
    # Add see also information to the template call in the format:
    # Also=[[Foo]], [[Bar]], ..., [[Baz]]
    $strArticle .= "Also=" . implode(", ", $arrSeeAlso);
    $strArticle .= "}}";
    $strArticle .= $strCRLF;
    $strArticle .= $strCRLF;

    # Add article text.
    # First add title in bold.
    $strArticle .= "'''" . $strTitle . "''':";
    # Next add pronunciations.
    foreach ($arrPronunciation as $strPronounceValue) {
        $strArticle .= " " . $strPronounceValue;
    } //foreach
    # Finally, add abbreviations.
    foreach ($arrUsageLinks as $strUsageLink) {
        $strArticle .= " " . $strUsageLink;
    } //foreach
    $strArticle .= $strCRLF;
    $strArticle .= $strCRLF;
    # Add article body.
    $strArticle .= $strBody;
    $strArticle .= $strCRLF;
    $strArticle .= $strCRLF;
    # Add "Sources" section.
    $strArticle .= "== Sources ==";
    $strArticle .= $strCRLF;
    # Add "Source" template with a link to the JargonWiki article on The Jargon File.
    $strArticle .= "{{Source|Source:Jargon_File|The Jargon File|version " . $strJargonFileVersion . "}}";
    $strArticle .= $strCRLF;
    $strArticle .= $strCRLF;
    # Add "External links" section.
    $strArticle .= "== External links ==";
    $strArticle .= $strCRLF;
    # Add link to the page containing the same article on The Jargon File Website.
    $strArticle .= "{{External Source|" . $strJargonFileLink . "|" . $strTitle . "| in [[Source:Jargon_File|The Jargon File]].}}";
    $strArticle .= $strCRLF;
    $strArticle .= $strCRLF;
    # Add "This article is in the public domain" template.
    $strArticle .= "{{Public Domain}}";
    $strArticle .= $strCRLF;
    $strArticle .= $strCRLF;
    # Add relevant category.
    $strArticle .= "[[Category:Hacker|{{PAGENAME}}]]";
    $strArticle .= $strCRLF;
    $strArticle .= $strCRLF;
    # Add formatting information for "BulkPageCreator.php".
    $strArticle .= "--ENDPAGE--";
    $strArticle .= $strCRLF;
    # This is the end of the construction of a wiki-formatted article for output to a text file
    # and eventual upload to the MediaWiki-powered host.

    # Below: HTML output used to debug during development.
    # The outer <div> and the "Wiki Format" <div> are intentionally left open here;
    # the caller echoes the two closing tags after the article text.
    if ($boolDebug) {
        $strDebug = "<div style=\"border:1px solid black; margin:.8em; padding:.8em;\">";
        $strDebug .= "<div style=\"border:1px dashed black; margin:.8em; padding:.8em;\">";
        $strDebug .= "<h3>HTML Format</h3>";
        $strDebug .= "<p></p>" . $strCRLF;
        $strDebug .= "<p></p>" . $strCRLF;
        $strDebug .= "<p>";
        $strDebug .= "title: " . str_replace(":", "", $strTitle) . "\n";
        $strDebug .= "</p><p>" . $strCRLF;
        $strDebug .= "etymology: " . $strEtymology;
        $strDebug .= "</p><p>" . $strCRLF;
        $strDebug .= "derivation: " . $strDerivation;
        $strDebug .= "</p><p>" . $strCRLF;
        $strDebug .= "pronunciations: " . describeArray($arrPronunciation);
        $strDebug .= "</p><p>" . $strCRLF;
        $strDebug .= "abbreviations: " . describeArray($arrAbbreviation);
        $strDebug .= "</p><p>" . $strCRLF;
        $strDebug .= "alternates: " . $strAlternate;
        $strDebug .= "</p><p>" . $strCRLF;
        # Describe the whole list of headword-line words (describeArray ignores non-arrays).
        $strDebug .= "info: " . describeArray($arrInfo);
        $strDebug .= "</p><p>" . $strCRLF;
        $strDebug .= "see also: " . describeArray($arrSeeAlso);
        $strDebug .= "</p><p>" . $strCRLF;
        $strDebug .= "body: " . $strBody . $strCRLF;
        $strDebug .= "</p>";
        $strDebug .= "</div>";
        $strDebug .= "<div style=\"border:1px dashed black; margin:.8em; padding:.8em;\">";
        $strDebug .= "<h3>Wiki Format</h3>";
        $strArticle = $strDebug . $strArticle;
    } //if
    return $strArticle;
} //function
# Append a string to a file on disc, creating the file if it does not exist yet.
#
# On failure to open the file a message is echoed and false is returned; the file handle
# is only closed when it was actually opened.
#
# @param string strParam ASCII data to be appended to a file on disc.
# @param string strParamOutputFileName The name of the file to which the data will be appended. If file does not exist, it will be created.
# @return boolean True when all of the data was written to disc, false otherwise.
function fappend($strParam, $strParamOutputFileName) {
    $fdFileOut = fopen($strParamOutputFileName, "a");
    if (!$fdFileOut) {
        echo "Output file failed to open. Bad or duplicate file name.";
        return false;
    } //if

    $strParam = (string) $strParam;
    $mixWritten = fwrite($fdFileOut, $strParam);
    fclose($fdFileOut);

    # fwrite() reports false on error, or the number of bytes it managed to write.
    return ($mixWritten !== false && $mixWritten === strlen($strParam));
} //function
# Parse one raw article, then either append it to the output file or echo it.
#
# @param string strParamArticleText The raw Jargon File text of one article.
# @param int intParamArticleNumber Running number of the article, used in the progress message.
# @param string strParamOutputMode "file" to append to strParamOutputFileName; anything else echoes to screen.
# @param string strParamOutputFileName File that receives the articles in "file" mode.
# @param mixed boolParamDebug Truthy when debug output is on.
# @return void
function emitArticle($strParamArticleText, $intParamArticleNumber, $strParamOutputMode, $strParamOutputFileName, $boolParamDebug) {
    global $strCRLF;
    # Output wiki-formatted article text.
    $strArticle = parseArticle($strParamArticleText);
    if ($strParamOutputMode == "file") {
        # Only claim success when the write actually succeeded.
        if (fappend($strArticle, $strParamOutputFileName)) {
            echo "Article " . $intParamArticleNumber . " successfully written to " . $strParamOutputFileName . "." . $strCRLF;
        } else {
            echo "Article " . $intParamArticleNumber . " could not be written to " . $strParamOutputFileName . "." . $strCRLF;
        } //if..else
    } else {
        echo $strArticle;
    } //if..else
    if ($boolParamDebug) {
        # Close the two <div> elements that parseArticle()'s debug dump leaves open.
        echo "</div>";
        echo "</div>";
    } //if
} //function

# Find the file we will process, and read it in.
# The author's observation is that $_GET and $_SERVER['argv'] can BOTH be set when the script
# runs, so neither isset() nor count($_SERVER['argv']) reveals HOW the arguments were passed.
# $_GET, however, only has entries when arguments really were passed via a query string, so
# count()ing it is used to choose the execution mode.
# Note, however, that this is a far from foolproof method. If no variables are passed via
# GET, then we'll bump to the else portion of the below test and be in "CLI mode," even
# though the script was invoked via HTTP rather than CLI.
# NOTE(review): in URI mode the input and output paths come straight from the request. This
# script is only meant for a controlled, local environment (see the header); do not expose it.
if (count($_GET) > 0) {
    # Via URI:
    if (array_key_exists("file", $_GET)) { $strInputFileName = $_GET["file"]; }
    if (array_key_exists("mode", $_GET)) { $strOutputMode = $_GET["mode"]; }
    if (array_key_exists("out", $_GET)) { $strOutputFileName = $_GET["out"]; }
    if (array_key_exists("debug", $_GET)) { $boolDebug = $_GET["debug"]; }
} else if (isset($argv) && is_array($argv)) {
    # Via CLI ($argv may not exist at all when the script is served over HTTP):
    if (array_key_exists(1, $argv)) { $strInputFileName = $argv[1]; }
    if (array_key_exists(2, $argv)) { $strOutputMode = $argv[2]; }
    if (array_key_exists(3, $argv)) { $strOutputFileName = $argv[3]; }
    if (array_key_exists(4, $argv)) { $boolDebug = $argv[4]; }
} //if..else

# Apply the documented defaults: mode "screen", output file "output.txt".
if (!isset($strOutputMode)) {
    $strOutputMode = "screen";
} //if
if (!isset($strOutputFileName) || $strOutputFileName == "") {
    $strOutputFileName = "output.txt";
} //if

# End initialization. Begin processing, provided we have something to process.
if (isset($strInputFileName) && file_exists($strInputFileName)) {
    $strArticleBlockRemainder = '';
    $intArticleCount = 0;
    $intInputFileSize = filesize($strInputFileName);
    # Two line breaks followed by ":" immediately precede the title of each article.
    $strArticleSeparator = $strCRLF . $strCRLF . ":";
    # Open Source File and Read.
    $fdArticles = fopen($strInputFileName, "r");
    if ($fdArticles) { //ensure handle opened ok.
        if ($intInputFileSize) { //nothing to do for a 0 byte file.
            while (!feof($fdArticles)) {
                $strArticles = fread($fdArticles, 8192);
                if ($strArticles === false) {
                    break;
                } //if
                # Prepend the leftovers from the previous read BEFORE splitting, so that a
                # separator straddling two reads is still recognised. The separator is a
                # literal string, so explode() does the job of the removed split().
                $arrArticleBlock = explode($strArticleSeparator, $strArticleBlockRemainder . $strArticles);
                # The last piece may be an incomplete article; keep it for the next read.
                $strArticleBlockRemainder = array_pop($arrArticleBlock);
                # Every other piece is a complete article: parse and output it.
                foreach ($arrArticleBlock as $strValueArticleBlock) {
                    $intArticleCount++;
                    emitArticle($strValueArticleBlock, $intArticleCount, $strOutputMode, $strOutputFileName, $boolDebug);
                } //foreach
            } //while
            # The last piece is always held back while reading, because only a subsequent read
            # could have completed it. Once the file is exhausted, and assuming the file ends
            # with a complete article, the remainder IS that final article, so parse it now.
            $intArticleCount++;
            emitArticle($strArticleBlockRemainder, $intArticleCount, $strOutputMode, $strOutputFileName, $boolDebug);
        } //if
        fclose($fdArticles);
    } else {
        echo "Input file failed to open.";
    } //if..else
} else {
    echo "Please pass a valid input filename in the arguments.";
} //if
exit;
?>
[edit]
External links
- Google Code site for this project: http://code.google.com/p/jargonwiki/

