You could also just modify this code (released under the GPLv3) and use it to
strip out the titles.
Stuff it into a file under linux called "parsetitle.c" and type:
gcc parsetitle.c -o parsetitle
./parsetitle < enwiki<date>.xml > titles.txt
Jeff
#include "platform.h"
#ifdef WINDOWS
#define strncasecmp strnicmp
#include "windows.h"
#include "winioctl.h"
#include "winuser.h"
#include "stdarg.h"
typedef UCHAR BYTE;
typedef USHORT WORD;
#include "stdio.h"
#include "stdlib.h"
#include "ctype.h"
#include "conio.h"
#endif
#ifdef LINUX
#include <unistd.h>
#include <stdio.h>
#include <stdlib.h>
#include <fcntl.h>
#include <ctype.h>
#include <string.h>
//#include <ncurses.h>
#include <termios.h>
#include <sys/ioctl.h>
#include <sys/stat.h>
#include <pthread.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <net/if.h>
#include <stdio.h>
#include <errno.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <sched.h>
#include <ctype.h>
#include <openssl/md5.h>
#endif
/* Raw input segment read from stdin by fgets(); one call may or may not
 * contain a whole line, so tag matching is done per-segment. */
char buffer[0x10000];
/* Most recently extracted title text (NUL-terminated copy of the bytes
 * between <title> and </title>). NOTE(review): only 4096 bytes, while a
 * segment can be up to 0x10000 bytes — callers must clamp the copy. */
char title[4096];
/*
 * Stream a MediaWiki XML dump on stdin and print each page title found
 * between <title> and </title> as "[Title]", one per line, on stdout.
 * Also reports pages that end (</page>) without a title or without a
 * <revision>, and titles that consist only of whitespace ("SPACES?").
 *
 * Limitation (unchanged from the original design): a <title> element must
 * not span two fgets() segments; when that happens a state error is
 * reported instead of recovering the split title.
 *
 * Returns 0 always; diagnostics go to stdout alongside the titles.
 */
int main(int argc, char *argv[])
{
    char *s, *p;
    int intitle = 0, inpage = 0, inrev = 0;
    int titlefound = 0, revision = 0;

    (void)argc;  /* no command-line options */
    (void)argv;

    while ((s = fgets(buffer, sizeof buffer, stdin)) != NULL)
    {
        while (*s)
        {
            /* End of this segment's line; move past the '\n' and read more.
             * (The original also tested !*s here, but that branch was
             * unreachable inside while (*s).) */
            if (*s == '\n')
            {
                s++;
                break;
            }
            /* strncmp (not memcmp) so the compare stops at the segment's
             * NUL terminator instead of reading bytes beyond it. */
            if (!strncmp(s, "<page>", 6))
            {
                s += 6;
                inpage++;
                titlefound = 0;
                revision = 0;
                continue;
            }
            if (!strncmp(s, "</page>", 7))
            {
                s += 7;
                if (!titlefound)
                    fprintf(stdout, "no article title?\n");
                if (!revision)
                    fprintf(stdout, "no revision?\n");
                titlefound = 0;
                revision = 0;
                if (inpage)
                    inpage--;
                continue;
            }
            if (!strncmp(s, "</revision>", 11))
            {
                if (inrev)
                    inrev--;
                s += 11;
                continue;
            }
            if (!strncmp(s, "<revision>", 10))
            {
                inrev++;
                revision = 1;
                s += 10;
                continue;
            }
            if (!strncmp(s, "<title>", 7))
            {
                intitle++;
                s += 7;
                p = strstr(s, "</title>");
                if (p)
                {
                    size_t len = (size_t)(p - s);

                    if (intitle)
                        intitle--;
                    if (len)
                    {
                        size_t i;
                        int nonspace = 0;

                        /* BUG FIX: the original strncpy'd p - s bytes into
                         * title[4096] with no bound — a long title overflowed
                         * the global. Clamp to the destination size. */
                        if (len >= sizeof title)
                            len = sizeof title - 1;
                        memcpy(title, s, len);
                        title[len] = '\0';

                        /* BUG FIX: the original advanced s first, so its scan
                         * loop had a zero bound (never ran) and dereferenced
                         * *p++, which points at "</title>", not the title.
                         * Scan the copied title instead.  Also fixes the
                         * inverted test: "SPACES?" now flags titles made
                         * entirely of whitespace, which is what the message
                         * means.  Cast to unsigned char: isspace() on a
                         * negative plain char is undefined behavior. */
                        for (i = 0; i < len; i++)
                        {
                            if (!isspace((unsigned char)title[i]))
                                nonspace = 1;
                        }
                        if (!nonspace)
                            fprintf(stdout, "[%s] SPACES?\n", title);
                        else
                            fprintf(stdout, "[%s]\n", title);

                        s = p;  /* resume scanning at "</title>" */
                    }
                    else
                        fprintf(stdout, "[%s] NULL?\n", s);  /* empty title */
                    titlefound = 1;
                    continue;
                }
                /* No closing tag in this segment: the title was split across
                 * fgets() reads — report and abandon it (see limitation). */
                if (intitle)
                {
                    intitle--;
                    printf("state error: title spanned segments [%s]\n", s);
                    continue;
                }
            }
            s++;  /* ordinary character: skip it */
        }
    }
    return 0;
}
Matthew Flaschen wrote:
Brion Vibber wrote:
Harish TM wrote:
I was trying to parse the Wikipedia dumps but
unfortunately I find the XML
file that can be downloaded a little hard to parse. I was wondering if there
is a neat way to extract:
1. The article title
/mediawiki/page/title
2. The article content (
without links to articles
in other languages, external links and so on )
The article content *contains* those links, so I guess you mean you want
to parse the text and remove certain elements of it?
3. The category.
Again, that's part of article text.
Also I find that there are a large number of tools
that allow one to convert
plain text to media wiki text. What if I want to go the other way and
extract information exactly the way it appears on the wikipedia site.
Run the wiki parser on it.
Or download (
http://static.wikipedia.org/downloads/November_2006/en/) it
parsed.
Matthew Flaschen
------------------------------------------------------------------------
_______________________________________________
Wikitech-l mailing list
Wikitech-l(a)lists.wikimedia.org
http://lists.wikimedia.org/mailman/listinfo/wikitech-l