// StripWiki.cpp // // (C) Copyright 2006, Mike Blaszczak. All Rights Reserved. // This program is provided without warranty and confers no rights. // // This program is intended to provide a C++ solution to the "C++ vs. // Perl bakeoff" thread in the Web Mastering and Programming forum at // http://www.hardforums.com/ (specifically, you can view the thread // at http://www.hardforum.com/showthread.php?t=1077745). // // The program takes a MySQL SQL dump file from Wikipedia and chops // it up into a tab-delimited file. The specific Wikipedia source used // is page.sql.gz from http://download.wikimedia.org/enwiki/20060702/. // // stdafx.h isn't really used so it is easier to post this file // for others to see #include "stdafx.h" #define _CRT_SECURE_NO_DEPRECATE #define WIN32_LEAN_AND_MEAN // Exclude rarely-used stuff from Windows headers #include #include #include #include #include #include const char szInsert[] = "INSERT INTO `page` VALUES"; int _tmain(int argc, _TCHAR* argv[]) { // we expect three arguments if (argc != 3) { printf("Usage\n"); printf("\tStripWiki \n\n"); exit(2); } // open the reading file for text FILE* fInput = fopen(argv[1], "r"); if (fInput == NULL) { printf("Couldn't open \"%s\"\n", argv[1]); exit(3); } // open the writing file. note that we use binary mode, // and so we'll explicity write newlines -- but that lets // us skip some processing in the CRT. FILE* fOutput = fopen(argv[2], "wb"); if (fInput == NULL) { fclose(fInput); printf("Couldn't open \"%s\"\n", argv[2]); exit(3); } // allocate fgets buffer int nBufferSize = 1024 * 1024 * 3; char* pstrBuffer = (char*) malloc(nBufferSize); // get a line while (NULL != fgets(pstrBuffer, nBufferSize, fInput)) { // if the line is an insert, we'll process it if (0 == strncmp(pstrBuffer, szInsert, sizeof(szInsert)-1)) { // got a live one! // arrays pointing to start of each field, plus length of each field char* pstrFields[11]; int nLengths[11]; // bump past the INSERT statement char *pstrCurrent = pstrBuffer + sizeof(szInsert); // for each open parenthesis we find, we should have a tuple char *pstrOpenParen; while (NULL != (pstrOpenParen = strchr(pstrCurrent, '('))) { // got one; move past the paren pstrCurrent = pstrOpenParen+1; // that is the first field's start pstrFields[0] = pstrCurrent; // we don't know the first field's length yet nLengths[0] = -1; // and the current field is the second field int nCurrentField = 1; // true if we're escaping the next character bool bEscaped = false; bool bInQuote = false; char* pstrSource = pstrCurrent; char* pstrDest = pstrCurrent; int nLength = 0; // _mm_prefetch(pstrOpenParen + 128, _MM_HINT_NTA); // while we haven't found the closing parenthesis... char* pstrCloseParen = NULL; while (pstrCloseParen == NULL) { if (bInQuote) { // we're inside a quote mark! // are we going to escape the next character? if (*pstrCurrent == '\\' && !bEscaped) { // yep! bEscaped = true; } else { // if we have a tick and we're not escaping if (*pstrCurrent == '\'' && !bEscaped) { // then the quoted string has ended // remember we're not in a quote anymore bInQuote = false; } else if (*pstrCurrent == '_') { // convert underlines to spaces *pstrDest++ = ' '; } else { // otherwise, just store *pstrDest++ = *pstrCurrent; } // and we're certainly not escaping bEscaped = false; } } else { // we're not in a quoted string right now if (*pstrCurrent == '\'') { // did we find the start of one? bInQuote = true; } else if (*pstrCurrent == ')') { *pstrDest++ = '\r'; *pstrDest++ = '\n'; pstrCloseParen = pstrCurrent; } else if (*pstrCurrent == ',') { // how about a field seperator? // yep, write our own *pstrDest++ = '\t'; nCurrentField++; } else { // otherwise, just store *pstrDest++ = *pstrCurrent; } } // move to the next character pstrCurrent++; } // check that we really got 11 fields if (nCurrentField != 11) { // we didn't; something is very wrong. // print out some debugging info and abort printf("Error! Only %d fields\n", nCurrentField); for (int n = 0; n < 11; n++) { printf("%d: %d \"%s\"\n", n, nLengths[n], pstrFields[n]); } exit(1); } /* if (atoi(pstrFields[0]) == 725003) { printf("!"); } */ // write the data fwrite(pstrOpenParen + 1, 1, pstrDest - pstrOpenParen - 1, fOutput); } } } // clean up read buffer free(pstrBuffer); // close the files fclose(fOutput); fclose(fInput); // call it a day return 0; }