// StripWikiSharp.cs
//
// (C) Copyright 2006, Mike Blaszczak. All Rights Reserved.
// This program is provided without warranty and confers no rights.
//
// This program is intended to provide a C# solution to the "C++ vs.
// Perl bakeoff" thread in the Web Mastering and Programming forum at
// http://www.hardforums.com/ (specifically, you can view the thread
// at http://www.hardforum.com/showthread.php?t=1077745).
//
// The program takes a MySQL SQL dump file from Wikipedia and chops
// it up into a tab-delimited file. The specific Wikipedia source used
// is page.sql.gz from http://download.wikimedia.org/enwiki/20060702/.
//

using System;
using System.Collections.Generic;
using System.Text;
using System.IO;

namespace StripWikiSharp
{
    class Program
    {
        // number of comma-separated fields expected in every "(...)" tuple
        const int FieldCount = 11;

        // name of the output file (may be overridden by the second
        // command-line argument)
        static string strFileName = "c:\\output.txt";

        // name of the input file (may be overridden by the first
        // command-line argument)
        static string strInputFileName = "F:\\links\\enwiki-20060702-page.sql";

        // everybody uses the writer
        static StreamWriter writer;

        // if we had a remainder during our last trip, this
        // holds it. It always starts at the open paren of the
        // unfinished tuple, or is empty when nothing is pending.
        static string strPartial = "";

        // Takes a single-quoted SQL string literal and returns its
        // contents: the surrounding quotes are dropped, underscores
        // become spaces, and the escapes \' \" and \\ are replaced by
        // the character they stand for. Any other escape sequence is
        // passed through untouched.
        //
        // A value that is not wrapped in single quotes (null, shorter
        // than two characters, or something like NULL) is returned
        // unchanged instead of having its first and last characters
        // chopped off.
        static string CleanString(string str)
        {
            if (str == null
                || str.Length < 2
                || str[0] != '\''
                || str[str.Length - 1] != '\'')
            {
                return str;
            }

            string strCleaned = str.Substring(1, str.Length - 2);
            strCleaned = strCleaned.Replace('_', ' ');
            strCleaned = strCleaned.Replace("\\'", "'");
            strCleaned = strCleaned.Replace("\\\"", "\"");
            strCleaned = strCleaned.Replace("\\\\", "\\");
            return strCleaned;
        }

        // Writes one complete tuple to the output file as a
        // tab-separated line terminated by CR LF.
        static void WriteTuple(string[] strFields)
        {
            writer.Write(strFields[0]);
            for (int n = 1; n < FieldCount; n++)
            {
                writer.Write("\t");
                if (n == 2 || n == 3 || n == 8)
                {
                    // some fields need cleaning
                    writer.Write(CleanString(strFields[n]));
                }
                else
                {
                    // others don't
                    writer.Write(strFields[n]);
                }
            }

            // explicit CR LF, regardless of the writer's NewLine setting
            writer.Write("\r\n");
        }

        // Processes a chunk of a MySQL INSERT INTO statement into a
        // bunch of tab-separated tuples.
        //
        // Text outside parentheses is ignored. Inside a tuple, commas
        // separate fields unless they sit inside a single-quoted string
        // (backslash escapes the next character there). A tuple that is
        // still open when the text runs out is remembered in strPartial
        // and re-scanned together with the next chunk. A tuple that does
        // not have exactly FieldCount fields is reported on stderr and
        // skipped.
        static void ProcessString(string str)
        {
            // handle the remainder. It is consumed here, so forget it;
            // otherwise it would be glued onto every later chunk as well.
            string strTotal = strPartial + str;
            strPartial = "";

            string[] strFields = new string[FieldCount];
            bool bInQuotes = false;
            bool bEscaped = false;
            int nOpenParenIndex = -1;
            int nField = 0;
            int nFieldStart = -1;

            for (int nIndex = 0; nIndex < strTotal.Length; nIndex++)
            {
                char ch = strTotal[nIndex];

                if (nOpenParenIndex == -1)
                {
                    // looking for open paren
                    if (ch == '(')
                    {
                        nOpenParenIndex = nIndex;
                        nFieldStart = nIndex + 1;
                        nField = 0;
                    }
                    continue;
                }

                if (bInQuotes)
                {
                    // separators and parens don't count inside a quote
                    if (bEscaped)
                    {
                        bEscaped = false;
                    }
                    else if (ch == '\\')
                    {
                        bEscaped = true;
                    }
                    else if (ch == '\'')
                    {
                        bInQuotes = false;
                    }
                    continue;
                }

                if (ch == '\'')
                {
                    bInQuotes = true;
                    continue;
                }

                if (ch != ',' && ch != ')')
                {
                    continue;
                }

                // end of a field; keep counting past FieldCount so an
                // oversized tuple is detected rather than overrunning
                // the array
                if (nField < FieldCount)
                {
                    strFields[nField] = strTotal.Substring(nFieldStart, nIndex - nFieldStart);
                }
                nField++;
                nFieldStart = nIndex + 1;

                if (ch == ')')
                {
                    // do we have everything?
                    if (nField == FieldCount)
                    {
                        WriteTuple(strFields);
                    }
                    else
                    {
                        Console.Error.WriteLine(
                            "Skipping tuple with " + nField + " fields (expected " + FieldCount + ")");
                    }

                    // reset our indexes
                    nOpenParenIndex = -1;
                    nFieldStart = -1;
                    nField = 0;
                }
            }

            // ran out of characters. do we have to remember a substring?
            if (nOpenParenIndex != -1)
            {
                strPartial = strTotal.Substring(nOpenParenIndex);
            }
        }

        // Entry point. Optional arguments: args[0] overrides the input
        // file name, args[1] overrides the output file name; with no
        // arguments the built-in defaults are used.
        static void Main(string[] args)
        {
            if (args != null && args.Length > 0)
            {
                strInputFileName = args[0];
            }
            if (args != null && args.Length > 1)
            {
                strFileName = args[1];
            }

            // open the input first so a missing input file does not
            // truncate an existing output file; the using blocks make
            // sure both files are closed on every exit path
            using (StreamReader reader = new StreamReader(strInputFileName))
            using (StreamWriter output = new StreamWriter(strFileName))
            {
                writer = output;
                strPartial = "";

                // skip to values
                string str;
                bool bFound = false;
                while (null != (str = reader.ReadLine()))
                {
                    int n = str.IndexOf("VALUES", StringComparison.Ordinal);
                    if (n >= 0)
                    {
                        bFound = true;
                        // skip just the keyword; anything before the
                        // first open paren is ignored by ProcessString
                        str = str.Substring(n + "VALUES".Length);
                        break;
                    }
                }

                // we found "VALUES", right?
                if (!bFound)
                {
                    Console.WriteLine("Premature end of file");
                    return;
                }

                // great; process what we have right now
                ProcessString(str);

                // then loop to process what we continue to read
                while (null != (str = reader.ReadLine()))
                {
                    ProcessString(str);
                }

                // flush before the using block closes the writer
                writer.Flush();
            }
        }
    }
}