You could also just modify this code (released under GPLv3) and use it to strip out titles.
Stuff it into a file under Linux called "parsetitle.c" and type:
gcc parsetitle.c -o parsetitle
./parsetitle < enwiki<date>.xml > titles.txt
Jeff
#include "platform.h"
#ifdef WINDOWS
#define strncasecmp strnicmp
#include "windows.h" #include "winioctl.h" #include "winuser.h" #include "stdarg.h" typedef UCHAR BYTE; typedef USHORT WORD; #include "stdio.h" #include "stdlib.h" #include "ctype.h" #include "conio.h"
#endif
#ifdef LINUX
#include <unistd.h> #include <stdio.h> #include <stdlib.h> #include <fcntl.h> #include <ctype.h> #include <string.h> //#include <ncurses.h> #include <termios.h> #include <sys/ioctl.h> #include <sys/stat.h> #include <pthread.h> #include <sys/types.h> #include <sys/socket.h> #include <netinet/in.h> #include <net/if.h> #include <stdio.h> #include <errno.h> #include <stdlib.h> #include <string.h> #include <unistd.h> #include <sched.h> #include <ctype.h> #include <openssl/md5.h>
#endif
char buffer[0x10000];
char title[4096];

int main(int argc, char *argv[])
{
    char *s, *p;
    int i, f, len;
    int intitle = 0, inpage = 0, inrev = 0;
    int titlefound = 0, revision = 0;

    while ((s = fgets(buffer, sizeof(buffer), stdin))) {
        while (*s) {
            if (*s == '\n') { s++; break; }
            if (!memcmp(s, "<page>", 6)) {
                s += 6; inpage++; titlefound = 0; revision = 0;
                continue;
            }
            if (!memcmp(s, "</page>", 7)) {
                s += 7;
                if (!titlefound) fprintf(stdout, "no article title?\n");
                if (!revision) fprintf(stdout, "no revision?\n");
                titlefound = 0; revision = 0;
                if (inpage) inpage--;
                continue;
            }
            if (!memcmp(s, "</revision>", 11)) {
                if (inrev) inrev--;
                s += 11;
                continue;
            }
            if (!memcmp(s, "<revision>", 10)) {
                inrev++; revision = 1; s += 10;
                continue;
            }
            if (!memcmp(s, "<title>", 7)) {
                intitle++; s += 7;
                p = strstr(s, "</title>");
                if (p) {
                    if (intitle) intitle--;
                    len = p - s;
                    if (len) {
                        strncpy(title, s, len);
                        title[len] = '\0';
                        /* flag titles that are nothing but whitespace;
                           note: len must be computed before advancing s,
                           and we scan the copied title, not past p */
                        for (f = 0, i = 0; i < len; i++)
                            if (!isspace((unsigned char)title[i])) f = 1;
                        if (f) fprintf(stdout, "[%s]\n", title);
                        else fprintf(stdout, "[%s] SPACES?\n", title);
                        s = p;
                    } else {
                        fprintf(stdout, "[%s] NULL?\n", s);
                    }
                    titlefound = 1;
                    continue;
                }
                /* no </title> on this line: fall through to the check below */
            }
            if (intitle) {
                intitle--;
                printf("state error: title spanned segments [%s]\n", s);
                continue;
            }
            /* putc(*s, stdout); */ /* uncomment to echo non-tag text */
            s++;
        }
    }
    return 0;
}
Matthew Flaschen wrote:
Brion Vibber wrote:
Harish TM wrote:
I was trying to parse the Wikipedia dumps, but unfortunately I find the XML file that can be downloaded a little hard to parse. I was wondering if there is a neat way to extract:
1. The article title
/mediawiki/page/title
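A streaming reader can pull that path out without hand-rolled scanning. Here is a minimal sketch using libxml2's xmlReader (assuming libxml2 is available; compile with the flags from xml2-config; it matches every <title> element, which the export schema only puts under <page>):

#include <libxml/xmlreader.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
    /* stream the dump from stdin; no DOM is built, so memory stays flat */
    xmlTextReaderPtr r = xmlReaderForFd(0, NULL, NULL, 0);
    if (!r)
        return 1;
    while (xmlTextReaderRead(r) == 1) {
        if (xmlTextReaderNodeType(r) == XML_READER_TYPE_ELEMENT &&
            !strcmp((const char *)xmlTextReaderConstName(r), "title")) {
            xmlChar *t = xmlTextReaderReadString(r); /* element text content */
            if (t) {
                printf("[%s]\n", t);
                xmlFree(t);
            }
        }
    }
    xmlFreeTextReader(r);
    return 0;
}

Because it streams, this handles a multi-gigabyte dump without loading it into memory.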
2. The article content (without links to articles in other languages, external links, and so on)
The article content *contains* those links, so I guess you mean you want to parse the text and remove certain elements of it?
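If that is the goal, something has to scan the wikitext itself. A rough sketch of the idea (a hypothetical helper, not a real wikitext parser; it assumes a two- or three-lowercase-letter prefix before ':' marks an interlanguage link, which will miss some language codes and misfire on short namespaces):

#include <stdio.h>
#include <string.h>
#include <ctype.h>

/* copy wikitext to out, dropping links whose target starts with a short
   all-lowercase prefix and a colon, e.g. [[fr:Paris]] */
void strip_interlanguage(const char *text, FILE *out)
{
    const char *s = text;
    while (*s) {
        if (s[0] == '[' && s[1] == '[') {
            const char *end = strstr(s + 2, "]]");
            const char *colon = strchr(s + 2, ':');
            if (end && colon && colon < end) {
                int alpha = (colon - (s + 2) >= 2 && colon - (s + 2) <= 3);
                for (const char *q = s + 2; alpha && q < colon; q++)
                    if (!islower((unsigned char)*q))
                        alpha = 0;
                if (alpha) {            /* looks like a language link: skip it */
                    s = end + 2;
                    continue;
                }
            }
        }
        fputc(*s++, out);
    }
}

int main(void)
{
    strip_interlanguage("See [[Paris]].\n[[fr:Paris]]\n", stdout);
    return 0;
}

A production version would check the prefix against MediaWiki's actual language-code list rather than guessing by length.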
3. The category.
Again, that's part of the article text.
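Since categories appear in the text as [[Category:...]] tags, a plain scan recovers them. A sketch (hypothetical helper; it ignores localized namespace names and categories added by templates):

#include <stdio.h>
#include <string.h>

/* print the name of every [[Category:...]] tag, dropping any |sort-key */
void print_categories(const char *text)
{
    const char *s = text;
    while ((s = strstr(s, "[[Category:"))) {
        s += 11;                              /* length of "[[Category:" */
        const char *end = strstr(s, "]]");
        if (!end)
            break;
        const char *pipe = memchr(s, '|', end - s);
        int len = (int)((pipe ? pipe : end) - s);
        printf("%.*s\n", len, s);
        s = end + 2;
    }
}

int main(void)
{
    print_categories("Text.\n[[Category:Physics]]\n[[Category:History|Hist]]\n");
    return 0;
}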
Also, I find that there are a large number of tools that allow one to convert plain text to MediaWiki text. What if I want to go the other way and extract information exactly the way it appears on the Wikipedia site?
Run the wiki parser on it.
Or download (http://static.wikipedia.org/downloads/November_2006/en/) it parsed.
Matthew Flaschen