Sample output, piping a Dazai text file through the script:

$ cat dazai.txt | python markov.py
私はこれまで、こんな不思議な表情の子供を見た事が、その子供の笑顔は、最も奇怪なのである。
そうして、ただもう不愉快、イライラしていた。
軽薄と言っても足りない。
あ、こんな不思議な美貌の学生にも、まんざら空お世辞に聞えないくらいの、まことに奇妙な、そうして、どこか怪談じみた気味悪いものが感ぜられて来るのである。
所謂「死相」というものになるであろうか、思い出した、というようなよろこびさえ無い。
Japanese text generator using Markov chain algorithm
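The idea: tokenize the input, train an order-n Markov chain that maps each n-word context window to the words observed after it, then generate text by repeatedly sampling a successor and sliding the window forward. A toy sketch of that structure, using English tokens and uniform sampling for brevity (the real script below uses MeCab tokens and frequency-weighted sampling):

# Toy 2nd-order Markov chain, illustration only.
from collections import defaultdict
import random

tokens = "the cat sat on the mat .".split()
model = defaultdict(list)
for i in range(len(tokens) - 2):
    key = (tokens[i], tokens[i + 1])   # 2-word context window
    model[key].append(tokens[i + 2])   # observed successor

# Walk the chain: sample a successor, slide the window forward.
state = ("the", "cat")
out = list(state)
while out[-1] != ".":
    nxt = random.choice(model[state])
    out.append(nxt)
    state = (state[-1], nxt)
print(" ".join(out))

The full script follows; it builds the same context-to-successors table from MeCab-tokenized sentences and samples successors in proportion to how often they were observed.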
# -*- coding: utf-8 -*-
# ref. https://qiita.com/k-jimon/items/f02fae75e853a9c02127
from collections import deque, defaultdict, Counter
from itertools import chain, islice, takewhile
import MeCab
import random
import re
import sys
from typing import Deque, Dict, Iterable, List, Tuple

ORDER = 2          # size of the n-gram context window
SENTENCE_NUM = 5   # number of sentences to generate
MECAB_OPTS = ' '.join([
    '-Owakati',  # output space-separated surface forms only
    # Dictionary path is machine-specific; point it at your
    # mecab-ipadic-neologd install.
    '-d /usr/lib/x86_64-linux-gnu/mecab/dic/mecab-ipadic-neologd',
])
SPACES = re.compile(r'\s')  # raw string avoids the invalid-escape warning

# context tuple -> list of observed successor words
TrainingModel = Dict[Tuple[str, ...], List[str]]
# context tuple -> parallel (words, counts) tuples for weighted sampling
InferenceModel = Dict[Tuple[str, ...], Tuple[Tuple[str, ...], Tuple[int, ...]]]

def train_model(
    texts: Iterable[str],
    order: int,
    tagger: MeCab.Tagger,
) -> TrainingModel:
    # Count which words follow each `order`-word context window.
    model: TrainingModel = defaultdict(list)
    words = wakati(texts, tagger)
    # Seed the window with a sentence-start marker plus the first tokens.
    head_words = ['[BOS]'] + list(islice(words, order - 1))
    queue = deque(head_words, order)
    for markov_value in words:
        if queue[-1] == '。':
            # A sentence just ended: record a transition back to [BOS].
            update_model(model, queue, '[BOS]')
        update_model(model, queue, markov_value)
    return model

def wakati(texts: Iterable[str], tagger: MeCab.Tagger) -> Iterable[str]:
    # Split each text into sentences ending in '。', strip whitespace, and
    # tokenize with MeCab's wakati (space-separated) output. str.split()
    # with no argument also drops the trailing empty token that MeCab's
    # "word word \n" output would otherwise leave behind.
    return chain.from_iterable(
        tagger.parse(SPACES.sub('', sentence)).split()
        for text in texts
        for sentence in re.findall(".*?。", text)
    )

def update_model(model: TrainingModel, queue: Deque[str], value: str) -> None:
    # Record `value` as a successor of the current window, then slide the
    # window forward (deque with maxlen=order evicts the oldest word).
    markov_key = tuple(queue)
    model[markov_key].append(value)
    queue.append(value)

def optimize_model(model: TrainingModel) -> InferenceModel:
    # Collapse each successor list into parallel (words, counts) tuples,
    # the shape random.choices() takes as (population, weights).
    return {
        k: tuple(zip(*Counter(v).items()))  # type: ignore
        for k, v in model.items()
    }

def gen_sentences(
    model: InferenceModel,
    order: int,
    sentence_num: int,
    seed: str = '[BOS]',
    max_words: int = 100,
) -> Iterable[str]:
    # Pick a random context starting with the seed word, then keep walking
    # the same chain across sentence boundaries.
    key_candidates = [key for key in model if key[0] == seed]
    if not key_candidates:
        print('keyword not found', file=sys.stderr)
        return iter([])
    markov_key = random.choice(key_candidates)
    queue = deque(markov_key, order)
    return (gen_sentence(model, queue, max_words) for _ in range(sentence_num))

def gen_sentence(
    model: InferenceModel,
    queue: Deque[str],
    max_words: int,
) -> str:
    # Emit words until a '。' (or the max_words cap) is reached, drop the
    # [BOS] markers, and re-attach the closing '。'.
    words = takewhile(
        lambda w: w != '。',
        (gen_word(model, queue) for _ in range(max_words)),
    )
    return ''.join(w for w in words if w != '[BOS]') + '。'

def gen_word(model: InferenceModel, queue: Deque[str]) -> str:
    # Sample the next word weighted by observed frequency, push it into
    # the window, and return the word that slides *out* of the window,
    # so the seed context itself is emitted before the sampled words.
    markov_key = tuple(queue)
    next_word = random.choices(*model[markov_key])[0]
    queue.append(next_word)
    return markov_key[0]

def main():
    tagger = MeCab.Tagger(MECAB_OPTS)
    text = sys.stdin.read()
    model = train_model([text], ORDER, tagger)
    model = optimize_model(model)
    sentences = gen_sentences(model, ORDER, SENTENCE_NUM)
    for sentence in sentences:
        print(sentence)

if __name__ == "__main__":
    main()
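
For reference, optimize_model reshapes each context's successor list into the (population, weights) pair that random.choices expects, so sampling is proportional to observed frequency. A quick standalone check of that shape (the example words are made up):

from collections import Counter
import random

successors = ['海', '海', '空']  # successors observed for one hypothetical context
words, counts = tuple(zip(*Counter(successors).items()))
print(words, counts)                     # ('海', '空') (2, 1)
print(random.choices(words, counts)[0])  # draws '海' twice as often as '空'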