You could also just modify this code (released under the GPLv3) and use it to
strip out the titles.
Stuff it into a file under linux called "parsetitle.c" and type:
gcc parsetitle.c -o parsetitle
./parsetitle < enwiki<date>.xml > titles.txt
Jeff
#include "platform.h"
#ifdef WINDOWS
#define strncasecmp strnicmp
#include "windows.h"
#include "winioctl.h"
#include "winuser.h"
#include "stdarg.h"
typedef UCHAR BYTE;
typedef USHORT WORD;
#include "stdio.h"
#include "stdlib.h"
#include "ctype.h"
#include "conio.h"
#endif
#ifdef LINUX
#include <unistd.h>
#include <stdio.h>
#include <stdlib.h>
#include <fcntl.h>
#include <ctype.h>
#include <string.h>
//#include <ncurses.h>
#include <termios.h>
#include <sys/ioctl.h>
#include <sys/stat.h>
#include <pthread.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <net/if.h>
#include <stdio.h>
#include <errno.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <sched.h>
#include <ctype.h>
#include <openssl/md5.h>
#endif
/* Raw input segment read from stdin by fgets(); one call may or may not
 * contain a whole line, so tag matching is done per-segment. */
char buffer[0x10000];
/* Most recently extracted title text (NUL-terminated copy of the bytes
 * between <title> and </title>). NOTE(review): only 4096 bytes, while a
 * segment can be up to 0x10000 bytes — callers must clamp the copy. */
char title[4096];
/*
 * Stream a MediaWiki XML dump on stdin and print each page title found
 * between <title> and </title> as "[Title]", one per line, on stdout.
 * Also reports pages that end (</page>) without a title or without a
 * <revision>, and titles that consist only of whitespace ("SPACES?").
 *
 * Limitation (unchanged from the original design): a <title> element must
 * not span two fgets() segments; when that happens a state error is
 * reported instead of recovering the split title.
 *
 * Returns 0 always; diagnostics go to stdout alongside the titles.
 */
int main(int argc, char *argv[])
{
    char *s, *p;
    int intitle = 0, inpage = 0, inrev = 0;
    int titlefound = 0, revision = 0;

    (void)argc;  /* no command-line options */
    (void)argv;

    while ((s = fgets(buffer, sizeof buffer, stdin)) != NULL)
    {
        while (*s)
        {
            /* End of this segment's line; move past the '\n' and read more.
             * (The original also tested !*s here, but that branch was
             * unreachable inside while (*s).) */
            if (*s == '\n')
            {
                s++;
                break;
            }
            /* strncmp (not memcmp) so the compare stops at the segment's
             * NUL terminator instead of reading bytes beyond it. */
            if (!strncmp(s, "<page>", 6))
            {
                s += 6;
                inpage++;
                titlefound = 0;
                revision = 0;
                continue;
            }
            if (!strncmp(s, "</page>", 7))
            {
                s += 7;
                if (!titlefound)
                    fprintf(stdout, "no article title?\n");
                if (!revision)
                    fprintf(stdout, "no revision?\n");
                titlefound = 0;
                revision = 0;
                if (inpage)
                    inpage--;
                continue;
            }
            if (!strncmp(s, "</revision>", 11))
            {
                if (inrev)
                    inrev--;
                s += 11;
                continue;
            }
            if (!strncmp(s, "<revision>", 10))
            {
                inrev++;
                revision = 1;
                s += 10;
                continue;
            }
            if (!strncmp(s, "<title>", 7))
            {
                intitle++;
                s += 7;
                p = strstr(s, "</title>");
                if (p)
                {
                    size_t len = (size_t)(p - s);

                    if (intitle)
                        intitle--;
                    if (len)
                    {
                        size_t i;
                        int nonspace = 0;

                        /* BUG FIX: the original strncpy'd p - s bytes into
                         * title[4096] with no bound — a long title overflowed
                         * the global. Clamp to the destination size. */
                        if (len >= sizeof title)
                            len = sizeof title - 1;
                        memcpy(title, s, len);
                        title[len] = '\0';

                        /* BUG FIX: the original advanced s first, so its scan
                         * loop had a zero bound (never ran) and dereferenced
                         * *p++, which points at "</title>", not the title.
                         * Scan the copied title instead.  Also fixes the
                         * inverted test: "SPACES?" now flags titles made
                         * entirely of whitespace, which is what the message
                         * means.  Cast to unsigned char: isspace() on a
                         * negative plain char is undefined behavior. */
                        for (i = 0; i < len; i++)
                        {
                            if (!isspace((unsigned char)title[i]))
                                nonspace = 1;
                        }
                        if (!nonspace)
                            fprintf(stdout, "[%s] SPACES?\n", title);
                        else
                            fprintf(stdout, "[%s]\n", title);

                        s = p;  /* resume scanning at "</title>" */
                    }
                    else
                        fprintf(stdout, "[%s] NULL?\n", s);  /* empty title */
                    titlefound = 1;
                    continue;
                }
                /* No closing tag in this segment: the title was split across
                 * fgets() reads — report and abandon it (see limitation). */
                if (intitle)
                {
                    intitle--;
                    printf("state error: title spanned segments [%s]\n", s);
                    continue;
                }
            }
            s++;  /* ordinary character: skip it */
        }
    }
    return 0;
}
Matthew Flaschen wrote:
Brion Vibber wrote:
Harish TM wrote:
I was trying to parse the Wikipedia dumps but
unfortunately I find the XML
file that can be downloaded a little hard to parse. I was wondering if there
is a neat way to extract:
1. The article title
/mediawiki/page/title
2. The article content (
without links to articles
in other languages, external links and so on )
The article content *contains* those links, so I guess you mean you want
to parse the text and remove certain elements of it?
3. The category.
Again, that's part of article text.
Also I find that there are a large number of tools
that allow one to convert
plain text to media wiki text. What if I want to go the other way and
extract information exactly the way it appears on the wikipedia site.
Run the wiki parser on it.
Or download (
http://static.wikipedia.org/downloads/November_2006/en/) it
parsed.
Matthew Flaschen
------------------------------------------------------------------------
_______________________________________________
Wikitech-l mailing list
Wikitech-l(a)lists.wikimedia.org
http://lists.wikimedia.org/mailman/listinfo/wikitech-l