You could also just modify this code (released under GPLv3) and use it to strip out titles.
Stuff it into a file under Linux called "parsetitle.c" and type:
gcc parsetitle.c -o parsetitle
./parsetitle < enwiki<date>.xml > titles.txt
Jeff
#include "platform.h"
#ifdef WINDOWS
#define strncasecmp strnicmp
#include "windows.h" #include "winioctl.h" #include "winuser.h" #include "stdarg.h" typedef UCHAR BYTE; typedef USHORT WORD; #include "stdio.h" #include "stdlib.h" #include "ctype.h" #include "conio.h"
#endif
#ifdef LINUX
#include <unistd.h> #include <stdio.h> #include <stdlib.h> #include <fcntl.h> #include <ctype.h> #include <string.h> //#include <ncurses.h> #include <termios.h> #include <sys/ioctl.h> #include <sys/stat.h> #include <pthread.h> #include <sys/types.h> #include <sys/socket.h> #include <netinet/in.h> #include <net/if.h> #include <stdio.h> #include <errno.h> #include <stdlib.h> #include <string.h> #include <unistd.h> #include <sched.h> #include <ctype.h> #include <openssl/md5.h>
#endif
char buffer[0x10000];
char title[4096];

int main(int argc, char *argv[])
{
    char *s, *p;
    int i, f, len;
    int intitle = 0, inpage = 0, inrev = 0;
    int titlefound = 0, revision = 0;

    while ((s = fgets(buffer, sizeof(buffer), stdin))) {
        while (*s) {
            if (*s == '\n') { s++; break; }
            if (!memcmp(s, "<page>", 6)) {
                s += 6; inpage++; titlefound = 0; revision = 0;
                continue;
            }
            if (!memcmp(s, "</page>", 7)) {
                s += 7;
                if (!titlefound) fprintf(stdout, "no article title?\n");
                if (!revision) fprintf(stdout, "no revision?\n");
                titlefound = 0; revision = 0;
                if (inpage) inpage--;
                continue;
            }
            if (!memcmp(s, "</revision>", 11)) {
                if (inrev) inrev--;
                s += 11;
                continue;
            }
            if (!memcmp(s, "<revision>", 10)) {
                inrev++; revision = 1; s += 10;
                continue;
            }
            if (!memcmp(s, "<title>", 7)) {
                intitle++; s += 7;
                p = strstr(s, "</title>");
                if (p) {
                    if (intitle) intitle--;
                    len = p - s;
                    if (len) {
                        strncpy(title, s, len);
                        title[len] = '\0';
                        /* flag titles that are nothing but whitespace;
                           note: len must be computed before advancing s,
                           and we scan the copied title, not past p */
                        for (f = 0, i = 0; i < len; i++)
                            if (!isspace((unsigned char)title[i])) f = 1;
                        if (f) fprintf(stdout, "[%s]\n", title);
                        else fprintf(stdout, "[%s] SPACES?\n", title);
                        s = p;
                    } else {
                        fprintf(stdout, "[%s] NULL?\n", s);
                    }
                    titlefound = 1;
                    continue;
                }
                /* no </title> on this line: fall through to the check below */
            }
            if (intitle) {
                intitle--;
                printf("state error: title spanned segments [%s]\n", s);
                continue;
            }
            /* putc(*s, stdout); */ /* uncomment to echo non-tag text */
            s++;
        }
    }
    return 0;
}
Matthew Flaschen wrote:
Brion Vibber wrote:
Harish TM wrote:
I was trying to parse the Wikipedia dumps, but unfortunately I find the XML file that can be downloaded a little hard to parse. I was wondering if there is a neat way to extract:
1. The article title
/mediawiki/page/title
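A streaming reader can pull that path out without hand-rolled scanning. Here is a minimal sketch using libxml2's xmlReader (assuming libxml2 is available; compile with the flags from xml2-config; it matches every <title> element, which the export schema only puts under <page>):

#include <libxml/xmlreader.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
    /* stream the dump from stdin; no DOM is built, so memory stays flat */
    xmlTextReaderPtr r = xmlReaderForFd(0, NULL, NULL, 0);
    if (!r)
        return 1;
    while (xmlTextReaderRead(r) == 1) {
        if (xmlTextReaderNodeType(r) == XML_READER_TYPE_ELEMENT &&
            !strcmp((const char *)xmlTextReaderConstName(r), "title")) {
            xmlChar *t = xmlTextReaderReadString(r); /* element text content */
            if (t) {
                printf("[%s]\n", t);
                xmlFree(t);
            }
        }
    }
    xmlFreeTextReader(r);
    return 0;
}

Because it streams, this handles a multi-gigabyte dump without loading it into memory.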
2. The article content (without links to articles in other languages, external links, and so on)
The article content *contains* those links, so I guess you mean you want to parse the text and remove certain elements of it?
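If that is the goal, something has to scan the wikitext itself. A rough sketch of the idea (a hypothetical helper, not a real wikitext parser; it assumes a two- or three-lowercase-letter prefix before ':' marks an interlanguage link, which will miss some language codes and misfire on short namespaces):

#include <stdio.h>
#include <string.h>
#include <ctype.h>

/* copy wikitext to out, dropping links whose target starts with a short
   all-lowercase prefix and a colon, e.g. [[fr:Paris]] */
void strip_interlanguage(const char *text, FILE *out)
{
    const char *s = text;
    while (*s) {
        if (s[0] == '[' && s[1] == '[') {
            const char *end = strstr(s + 2, "]]");
            const char *colon = strchr(s + 2, ':');
            if (end && colon && colon < end) {
                int alpha = (colon - (s + 2) >= 2 && colon - (s + 2) <= 3);
                for (const char *q = s + 2; alpha && q < colon; q++)
                    if (!islower((unsigned char)*q))
                        alpha = 0;
                if (alpha) {            /* looks like a language link: skip it */
                    s = end + 2;
                    continue;
                }
            }
        }
        fputc(*s++, out);
    }
}

int main(void)
{
    strip_interlanguage("See [[Paris]].\n[[fr:Paris]]\n", stdout);
    return 0;
}

A production version would check the prefix against MediaWiki's actual language-code list rather than guessing by length.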
3. The category.
Again, that's part of the article text.
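Since categories appear in the text as [[Category:...]] tags, a plain scan recovers them. A sketch (hypothetical helper; it ignores localized namespace names and categories added by templates):

#include <stdio.h>
#include <string.h>

/* print the name of every [[Category:...]] tag, dropping any |sort-key */
void print_categories(const char *text)
{
    const char *s = text;
    while ((s = strstr(s, "[[Category:"))) {
        s += 11;                              /* length of "[[Category:" */
        const char *end = strstr(s, "]]");
        if (!end)
            break;
        const char *pipe = memchr(s, '|', end - s);
        int len = (int)((pipe ? pipe : end) - s);
        printf("%.*s\n", len, s);
        s = end + 2;
    }
}

int main(void)
{
    print_categories("Text.\n[[Category:Physics]]\n[[Category:History|Hist]]\n");
    return 0;
}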
Also, I find that there are a large number of tools that allow one to convert plain text to MediaWiki text. What if I want to go the other way and extract information exactly the way it appears on the Wikipedia site?
Run the wiki parser on it.
Or download (http://static.wikipedia.org/downloads/November_2006/en/) it parsed.
Matthew Flaschen