Created
November 27, 2024 21:48
-
-
Save jrwren/1cffa25134ba6a39451c40a05e2c765f to your computer and use it in GitHub Desktop.
Parse a large Wikiquote XML export efficiently.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package wikiquote | |
import ( | |
"encoding/xml" | |
"iter" | |
"log" | |
"os" | |
"strings" | |
) | |
// Filename is the path of the XML export that Titles and Pages open.
// It is a package-level variable so callers can point the package at a
// different file before calling them.
var Filename = "wikiquote.xml"
func Titles() (titles []string) { | |
src, err := os.Open(Filename) | |
if err != nil { | |
log.Fatal(err) | |
} | |
defer src.Close() | |
d := xml.NewDecoder(src) | |
var page Page | |
for t, _ := d.Token(); t != nil; t, _ = d.Token() { | |
switch se := t.(type) { | |
case xml.StartElement: | |
if se.Name.Local == "page" { | |
d.DecodeElement(&page, &se) | |
titles = append(titles, page.Title) | |
} | |
} | |
} | |
return titles | |
} | |
func Pages() iter.Seq[*Page] { | |
return func(yield func(*Page) bool) { | |
src, err := os.Open(Filename) | |
if err != nil { | |
log.Fatal(err) | |
} | |
defer src.Close() | |
d := xml.NewDecoder(src) | |
for t, _ := d.Token(); t != nil; t, _ = d.Token() { | |
switch se := t.(type) { | |
case xml.StartElement: | |
if se.Name.Local == "page" { | |
var page Page | |
err = d.DecodeElement(&page, &se) | |
if err != nil { | |
log.Fatal(err) | |
} | |
if !yield(&page) { | |
return | |
} | |
} | |
} | |
} | |
} | |
} | |
func (p *Page) Quotes() []string { | |
var quotes []string | |
text := p.Revision.Text.Text | |
for _, line := range strings.Split(text, "\n") { | |
if strings.HasPrefix(line, "*") { | |
quotes = append(quotes, line) | |
} | |
} | |
return quotes | |
} | |
// Mediawiki was generated 2024-11-05 14:51:17 by https://xml-to-go.github.io/ in Ukraine. | |
type Mediawiki struct { | |
XMLName xml.Name `xml:"mediawiki"` | |
Text string `xml:",chardata"` | |
Xmlns string `xml:"xmlns,attr"` | |
Xsi string `xml:"xsi,attr"` | |
SchemaLocation string `xml:"schemaLocation,attr"` | |
Version string `xml:"version,attr"` | |
Lang string `xml:"lang,attr"` | |
Siteinfo struct { | |
Text string `xml:",chardata"` | |
Sitename string `xml:"sitename"` | |
Dbname string `xml:"dbname"` | |
Base string `xml:"base"` | |
Generator string `xml:"generator"` | |
Case string `xml:"case"` | |
Namespaces struct { | |
Text string `xml:",chardata"` | |
Namespace []struct { | |
Text string `xml:",chardata"` | |
Key string `xml:"key,attr"` | |
Case string `xml:"case,attr"` | |
} `xml:"namespace"` | |
} `xml:"namespaces"` | |
} `xml:"siteinfo"` | |
Page []Page `xml:"page"` | |
} | |
// Page is a trimmed-down mapping of a <page> element: the page title plus the
// <text> child of its <revision>, including that element's bytes, sha1 and
// space attributes. Child elements without a matching field are ignored when
// decoding.
type Page struct {
	// Text receives character data that sits directly inside <page>.
	Text     string `xml:",chardata"`
	Title    string `xml:"title"`
	Revision struct {
		// Chardata receives character data that sits directly inside <revision>.
		Chardata string `xml:",chardata"`
		Text     struct {
			// Text is the content of the <text> element itself.
			Text  string `xml:",chardata"`
			Bytes string `xml:"bytes,attr"`
			Sha1  string `xml:"sha1,attr"`
			Space string `xml:"space,attr"`
		} `xml:"text"`
	} `xml:"revision"`
}
// FullPage is the complete mapping of a <page> element: title, namespace and
// id, plus every child of its <revision> (ids, timestamp, contributor,
// comment, origin, model, format, text and sha1). All values are kept as the
// strings found in the XML; nothing is parsed into numbers or times.
type FullPage struct {
	// Text receives character data that sits directly inside <page>.
	Text     string `xml:",chardata"`
	Title    string `xml:"title"`
	Ns       string `xml:"ns"`
	ID       string `xml:"id"`
	Revision struct {
		// Chardata receives character data that sits directly inside <revision>.
		Chardata    string `xml:",chardata"`
		ID          string `xml:"id"`
		Parentid    string `xml:"parentid"`
		Timestamp   string `xml:"timestamp"`
		Contributor struct {
			Text     string `xml:",chardata"`
			Username string `xml:"username"`
			ID       string `xml:"id"`
		} `xml:"contributor"`
		Comment string `xml:"comment"`
		Origin  string `xml:"origin"`
		Model   string `xml:"model"`
		Format  string `xml:"format"`
		Text    struct {
			// Text is the content of the <text> element itself.
			Text  string `xml:",chardata"`
			Bytes string `xml:"bytes,attr"`
			Sha1  string `xml:"sha1,attr"`
			Space string `xml:"space,attr"`
		} `xml:"text"`
		// Sha1 is the <sha1> child element of <revision>, distinct from the
		// sha1 attribute carried by <text>.
		Sha1 string `xml:"sha1"`
	} `xml:"revision"`
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment