// StripWiki.cpp // // (C) Copyright 2006, Mike Blaszczak. All Rights Reserved. // This program is provided without warranty and confers no rights. // // This program is intended to provide a C++ solution to the "C++ vs. // Perl bakeoff" thread in the Web Mastering and Programming forum at // http://www.hardforums.com/ (specifically, you can view the thread // at http://www.hardforum.com/showthread.php?t=1077745). // // The program takes a MySQL SQL dump file from Wikipedia and chops // it up into a tab-delimited file. The specific Wikipedia source used // is page.sql.gz from http://download.wikimedia.org/enwiki/20060702/. // // stdafx.h isn't really used so it is easier to post this file // for others to see #include "stdafx.h" #define _CRT_SECURE_NO_DEPRECATE #define WIN32_LEAN_AND_MEAN // Exclude rarely-used stuff from Windows headers #include #include #include #include #include #include const char szInsert[] = "INSERT INTO `page` VALUES"; int _tmain(int argc, _TCHAR* argv[]) { // we expect three arguments if (argc != 3) { printf("Usage\n"); printf("\tStripWiki \n\n"); exit(2); } // open the reading file for text FILE* fInput = fopen(argv[1], "r"); if (fInput == NULL) { printf("Couldn't open \"%s\"\n", argv[1]); exit(3); } // open the writing file. note that we use binary mode, // and so we'll explicity write newlines -- but that lets // us skip some processing in the CRT. FILE* fOutput = fopen(argv[2], "wb"); if (fInput == NULL) { fclose(fInput); printf("Couldn't open \"%s\"\n", argv[2]); exit(3); } // allocate fgets buffer int nBufferSize = 1024 * 1024 * 3; char* pstrBuffer = (char*) malloc(nBufferSize); // get a line while (NULL != fgets(pstrBuffer, nBufferSize, fInput)) { // if the line is an insert, we'll process it if (0 == strncmp(pstrBuffer, szInsert, sizeof(szInsert)-1)) { // got a live one! // arrays pointing to start of each field, plus length of each field char* pstrFields[11]; int nLengths[11]; // bump past the INSERT statement char *pstrCurrent = pstrBuffer + sizeof(szInsert); // for each open parenthesis we find, we should have a tuple char *pstrOpenParen; while (NULL != (pstrOpenParen = strchr(pstrCurrent, '('))) { // got one; move past the paren pstrCurrent = pstrOpenParen+1; // that is the first field's start pstrFields[0] = pstrCurrent; // we don't know the first field's length yet nLengths[0] = -1; // and the current field is the second field int nCurrentField = 1; // true if we're escaping the next character bool bEscaped = false; // pstrSource is the start of a quoted string char* pstrSource = NULL; // pstrDest is where we're writing the next character to handle // escapes in a quoted string char* pstrDest; // _mm_prefetch(pstrOpenParen + 128, _MM_HINT_NTA); // while we haven't found the closing parenthesis... char* pstrCloseParen = NULL; while (pstrCloseParen == NULL) { if (pstrSource != NULL) { // we're inside a quote mark! // are we going to escape the next character? if (*pstrCurrent == '\\' && !bEscaped) { // yep! bEscaped = true; } else { // if we have a tick and we're not escaping if (*pstrCurrent == '\'' && !bEscaped) { // then the quoted string has ended. terminate it *pstrDest = '\t'; // and bump the start from the fields list. it was // recorded at the comma, and it really started after the tick pstrFields[nCurrentField-1]++; // figure the length nLengths[nCurrentField-1] = pstrDest - pstrFields[nCurrentField-1] + 1; // and remember we're not in a quote anymore pstrSource = NULL; } else if (*pstrCurrent == '_') { // convert underlines to spaces *pstrDest++ = ' '; } else { // otherwise, just store *pstrDest++ = *pstrCurrent; } // and we're certainly not escaping bEscaped = false; } } else { // we're not in a quoted string right now if (*pstrCurrent == '\'') { // did we find the start of one? pstrSource = pstrCurrent + 1; pstrDest = pstrSource; } else if (*pstrCurrent == ')') { // did we find the lcosing paren? nLengths[nCurrentField-1] = pstrCurrent - pstrFields[nCurrentField-1]; pstrCloseParen = pstrCurrent; // *pstrCurrent = 0; } else if (*pstrCurrent == ',') { // how about a field seperator? // if it wasn't a quoted string, we'll take the length here if (nLengths[nCurrentField-1] == -1) { nLengths[nCurrentField-1] = pstrCurrent - pstrFields[nCurrentField-1] + 1; *pstrCurrent = '\t'; } // make a note that we don't have a length for the next field nLengths[nCurrentField] = -1; // but this character marks the start of the current field pstrFields[nCurrentField++] = pstrCurrent+1; } } // move to the next character pstrCurrent++; } // check that we really got 11 fields if (nCurrentField != 11) { // we didn't; something is very wrong. // print out some debugging info and abort printf("Error! Only %d fields\n", nCurrentField); for (int n = 0; n < 11; n++) { printf("%d: %d \"%s\"\n", n, nLengths[n], pstrFields[n]); } exit(1); } /* if (atoi(pstrFields[0]) == 725003) { printf("!"); } */ // write the first field fwrite(pstrFields[0], 1, nLengths[0], fOutput); for (int n = 1; n < nCurrentField; n++) { // write a tab // fwrite("\t", 1, 1, fOutput); // then write the field content fwrite(pstrFields[n], 1, nLengths[n], fOutput); } // that's the end of the record, so write a newline fwrite("\r\n", 2, 1, fOutput); } } } // clean up read buffer free(pstrBuffer); // close the files fclose(fOutput); fclose(fInput); // call it a day return 0; }