Skip to content

Instantly share code, notes, and snippets.

@jrwren
Created November 27, 2024 21:48
Show Gist options
  • Save jrwren/1cffa25134ba6a39451c40a05e2c765f to your computer and use it in GitHub Desktop.
Save jrwren/1cffa25134ba6a39451c40a05e2c765f to your computer and use it in GitHub Desktop.
Parse a large Wikiquote XML export efficiently.
// Package wikiquote reads <page> elements out of an XML export file by
// streaming it through encoding/xml one token at a time, and extracts
// quote lines from the decoded page text.
package wikiquote
import (
	"encoding/xml"
	"io"
	"iter"
	"log"
	"os"
	"strings"
)
// Filename is the path of the XML file that Titles and Pages open.
// It defaults to "wikiquote.xml"; being a package-level variable, it can be
// reassigned before those functions are called.
var Filename = "wikiquote.xml"
// Titles returns the title of every <page> element found in the XML file
// named by Filename, in document order.
//
// The file is read token by token; each page is decoded, its title recorded,
// and the rest of the page discarded. A page that has no <title> child
// contributes an empty string.
//
// Failing to open the file, a tokenizer error other than a clean end of
// input, or a page that cannot be decoded terminates the program through
// log.Fatal.
func Titles() (titles []string) {
	src, err := os.Open(Filename)
	if err != nil {
		log.Fatal(err)
	}
	defer src.Close()

	d := xml.NewDecoder(src)
	for {
		t, err := d.Token()
		if err == io.EOF {
			// Clean end of a well-formed document.
			break
		}
		if err != nil {
			// Malformed or truncated XML: report it rather than returning a
			// silently shortened list.
			log.Fatal(err)
		}
		se, ok := t.(xml.StartElement)
		if !ok || se.Name.Local != "page" {
			continue
		}
		// A fresh value per page keeps fields of the previous page from
		// leaking into one that lacks the corresponding element.
		var page Page
		if err := d.DecodeElement(&page, &se); err != nil {
			log.Fatal(err)
		}
		titles = append(titles, page.Title)
	}
	return titles
}
// Pages returns an iterator over the <page> elements of the XML file named
// by Filename, in document order.
//
// The file is opened when iteration starts and closed when it ends, whether
// the input is exhausted or the consumer stops early. Each yielded *Page is a
// newly allocated value, so callers may retain it after the next step.
//
// Failing to open the file, a tokenizer error other than a clean end of
// input, or a page that cannot be decoded terminates the program through
// log.Fatal.
func Pages() iter.Seq[*Page] {
	return func(yield func(*Page) bool) {
		src, err := os.Open(Filename)
		if err != nil {
			log.Fatal(err)
		}
		defer src.Close()

		d := xml.NewDecoder(src)
		for {
			t, err := d.Token()
			if err == io.EOF {
				// Clean end of a well-formed document.
				return
			}
			if err != nil {
				// Malformed XML between pages must not look like a normal
				// end of iteration.
				log.Fatal(err)
			}
			se, ok := t.(xml.StartElement)
			if !ok || se.Name.Local != "page" {
				continue
			}
			page := new(Page)
			if err := d.DecodeElement(page, &se); err != nil {
				log.Fatal(err)
			}
			if !yield(page) {
				return
			}
		}
	}
}
// Quotes returns, in order of appearance, every line of the page's revision
// text that begins with "*". Lines are returned unmodified (the leading
// marker is kept), and the result is nil when no line qualifies.
func (p *Page) Quotes() []string {
	var (
		found     []string
		remaining = p.Revision.Text.Text
	)
	for {
		line, rest, more := strings.Cut(remaining, "\n")
		if strings.HasPrefix(line, "*") {
			found = append(found, line)
		}
		if !more {
			// That was the final (possibly empty) line.
			return found
		}
		remaining = rest
	}
}
// Mediawiki mirrors the root <mediawiki> element of the export: its
// attributes, the <siteinfo> header and every <page> child.
//
// Titles and Pages do not decode into this type; they decode one <page> at a
// time into Page. Decoding a whole document into Mediawiki would collect
// every page in the Page slice.
//
// Mediawiki was generated 2024-11-05 14:51:17 by https://xml-to-go.github.io/ in Ukraine.
type Mediawiki struct {
XMLName xml.Name `xml:"mediawiki"`
// Text receives the character data found directly inside <mediawiki>.
Text string `xml:",chardata"`
// Attributes of the root element, matched by local name and kept as raw strings.
// NOTE(review): Xsi and SchemaLocation presumably correspond to xmlns:xsi
// and xsi:schemaLocation — confirm against a real export.
Xmlns string `xml:"xmlns,attr"`
Xsi string `xml:"xsi,attr"`
SchemaLocation string `xml:"schemaLocation,attr"`
Version string `xml:"version,attr"`
Lang string `xml:"lang,attr"`
// Siteinfo maps the <siteinfo> child element and its children.
Siteinfo struct {
Text string `xml:",chardata"`
Sitename string `xml:"sitename"`
Dbname string `xml:"dbname"`
Base string `xml:"base"`
Generator string `xml:"generator"`
Case string `xml:"case"`
// Namespaces maps <namespaces> and its repeated <namespace> children.
Namespaces struct {
Text string `xml:",chardata"`
Namespace []struct {
// Text is the character data of the <namespace> element; Key and Case
// are its "key" and "case" attributes.
Text string `xml:",chardata"`
Key string `xml:"key,attr"`
Case string `xml:"case,attr"`
} `xml:"namespace"`
} `xml:"namespaces"`
} `xml:"siteinfo"`
// Page collects every <page> child of the root element.
Page []Page `xml:"page"`
}
// Page is the subset of a <page> element that Titles, Pages and Quotes work
// with: the page title and the text of its <revision>. Child elements with no
// matching field here are not captured.
type Page struct {
// Text receives the character data found directly inside <page>.
Text string `xml:",chardata"`
// Title is the content of the <title> child.
Title string `xml:"title"`
// Revision maps the <revision> child; only its <text> element is captured.
Revision struct {
// Chardata receives the character data found directly inside <revision>.
Chardata string `xml:",chardata"`
// Text maps the <text> element: its character data (the content that
// Quotes scans line by line) plus three attributes kept as raw strings.
Text struct {
Text string `xml:",chardata"`
Bytes string `xml:"bytes,attr"`
Sha1 string `xml:"sha1,attr"`
// NOTE(review): presumably the xml:space attribute — confirm.
Space string `xml:"space,attr"`
} `xml:"text"`
} `xml:"revision"`
}
// FullPage maps a <page> element with more of its children than Page does:
// in addition to the title and revision text it captures <ns>, <id> and the
// revision's metadata elements. All values are kept as strings.
//
// Nothing in the visible part of this file decodes into FullPage.
type FullPage struct {
// Text receives the character data found directly inside <page>.
Text string `xml:",chardata"`
Title string `xml:"title"`
Ns string `xml:"ns"`
ID string `xml:"id"`
// Revision maps the <revision> child element.
Revision struct {
// Chardata receives the character data found directly inside <revision>.
Chardata string `xml:",chardata"`
ID string `xml:"id"`
Parentid string `xml:"parentid"`
Timestamp string `xml:"timestamp"`
// Contributor maps the <contributor> child with its <username> and <id>.
Contributor struct {
Text string `xml:",chardata"`
Username string `xml:"username"`
ID string `xml:"id"`
} `xml:"contributor"`
Comment string `xml:"comment"`
Origin string `xml:"origin"`
Model string `xml:"model"`
Format string `xml:"format"`
// Text maps the <text> element: its character data plus three
// attributes kept as raw strings.
Text struct {
Text string `xml:",chardata"`
Bytes string `xml:"bytes,attr"`
Sha1 string `xml:"sha1,attr"`
// NOTE(review): presumably the xml:space attribute — confirm.
Space string `xml:"space,attr"`
} `xml:"text"`
// Sha1 is the <sha1> child element of <revision>, distinct from the
// sha1 attribute on <text>.
Sha1 string `xml:"sha1"`
} `xml:"revision"`
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment