|
import glob
import os
import re
import sys
from collections.abc import Iterator
|
|
|
#### |
|
|
|
|
|
class Chapter:
    """Per-section word counter for the lines of one chapter file.

    The chapter is read line by line. Lines starting with one to five ``=``
    are treated as section titles; every other line (outside comment and
    formula blocks) adds to the word count of the section it belongs to.
    """

    def __init__(self, num: int, data: Iterator[str]):
        """Store the chapter number and the source of lines.

        Args:
            num: Chapter number. It is kept as a string because it is used
                as the first component of every section path.
            data: Iterator yielding the chapter's lines.
        """
        self.num = str(num)
        self.data = data

    def parse(self):
        """Count words per section and return them keyed by section path.

        The first line of the chapter is always skipped. Blocks delimited by
        lines starting with ``////`` (comments) or ``****`` (formulas) are
        skipped entirely, delimiter lines included. Lines that appear before
        the first non-empty title are ignored.

        Returns:
            A dict mapping a section path, as built by ``path_from`` (for
            example ``"03 Title;03.1 Sub"``), to the number of words counted
            by ``count_words`` on the lines directly under that section.

        Raises:
            AssertionError: If a second level-1 title is found, or if two
                sections resolve to the same path.
        """
        words = {}
        nums = []
        prefix = []
        current_path = None  # path of the section receiving normal lines
        first_line = True
        in_comment = False
        in_formula = False

        for line in self.data:
            # skip first line
            if first_line:
                first_line = False
                continue

            # skip comment blocks; both delimiter lines are skipped too, so
            # the closing delimiter is never counted as a word
            if line.startswith("////"):
                in_comment = not in_comment
                continue
            if in_comment:
                continue

            # skip formula blocks, same rules as comments
            if line.startswith("****"):
                in_formula = not in_formula
                continue
            if in_formula:
                continue

            # retrieve information from titles only (e.g. ==== TITLE)
            depth = count_equal(line)
            if depth > 0:
                # drop only the leading '=' run so that a title containing
                # '=' keeps its full text
                name = simplify_name(line.lstrip("=").rstrip("\n"))

                # ignore empty titles
                if name == "":
                    continue

                if depth == 1:
                    # main title: initialization, allowed only once
                    assert not prefix, "more than one main title"
                    assert not words, "sections found before the main title"
                    nums = [self.num]
                    prefix = [name]
                elif depth > len(prefix):
                    # nesting: open a new sub-level numbered from 1
                    nums.append(1)
                    prefix.append(name)
                else:
                    # same level or unnesting: drop every deeper level (there
                    # may be several), then move on to the next sibling
                    del nums[depth:]
                    del prefix[depth:]
                    nums[-1] += 1
                    prefix[-1] = name

                current_path = path_from(nums, prefix)
                assert current_path not in words, "duplicate section path"
                words[current_path] = 0
                continue

            # if we're still not initialized, skip
            if current_path is None:
                continue

            # a normal line: count the words
            words[current_path] += count_words(line)

        return words
|
|
|
|
|
def path_from(nums, prefix):
    """Build the ';'-separated path of a section from its numbers and titles.

    Each path component is the dotted numbering up to that level, a space,
    and the title of that level. The inputs are not modified.

    Args:
        nums: Section numbers, one per level (any values convertible with
            ``str``); must not be empty.
        prefix: Section titles, one per level.

    Returns:
        The joined path, e.g. ``"03 Title;03.1 Sub"``.
    """
    labels = list(map(str, nums))

    # trick to make sure chapters are sorted: pad single-character chapter
    # numbers with a leading zero
    if len(labels[0]) == 1:
        labels[0] = "0" + labels[0]

    return ";".join(
        ".".join(labels[: level + 1]) + " " + title
        for level, title in enumerate(prefix)
    )
|
|
|
|
|
def simplify_name(name: str):
    """Return *name* without leading and trailing space characters.

    Only the space character is stripped; tabs and newlines are kept.
    """
    return name.strip(" ")
|
|
|
#### |
|
|
|
|
|
def count_equal(line):
    """Return how many '=' characters start *line*, looking at 5 at most.

    The result is 0 when the line is empty or does not start with '='.
    """
    head = line[:5]
    # the characters removed by lstrip are exactly the leading '=' run
    return len(head) - len(head.lstrip("="))
|
|
|
|
|
def count_words(line):
    """Return the number of whitespace-separated words in *line*.

    Splitting on any run of whitespace means that blank lines count as zero
    words and that repeated spaces or tabs do not produce phantom words.
    """
    return len(line.split())
|
|
|
|
|
def parse_chapter(chapt_num, chapter):
    """Print the numbered section outline of a chapter and count its words.

    The first line is always skipped. Blocks delimited by lines starting
    with ``////`` (comments) or ``****`` (formulas) are skipped entirely,
    delimiter lines included. Every line starting with '=' and carrying a
    non-empty title is printed through ``print_section_number``.

    Args:
        chapt_num: Value the depth-0 counter starts from. The main title
            increments that counter before it is printed, so the caller in
            this file passes the chapter number minus one.
        chapter: Iterable yielding the chapter's lines.

    Returns:
        The total number of words, as counted by ``count_words``, on all
        non-title lines outside comment and formula blocks.
    """
    section_depth = {0: chapt_num}

    words = 0
    first_line = True
    in_comment = False
    in_formula = False

    for line in chapter:
        # skip first line
        if first_line:
            first_line = False
            continue

        # skip comment blocks; the closing delimiter is skipped as well so
        # it is not mistaken for a normal line
        if line.startswith("////"):
            in_comment = not in_comment
            continue
        if in_comment:
            continue

        # skip formula blocks, same rules as comments
        if line.startswith("****"):
            in_formula = not in_formula
            continue
        if in_formula:
            continue

        # retrieve information from titles only (e.g. ==== TITLE)
        depth = count_equal(line)
        if depth > 0:
            # drop only the leading '=' run so that a title containing '='
            # keeps its full text
            name = line.lstrip("=").rstrip("\n")

            # ignore empty titles
            if name == "":
                continue

            section_depth = print_section_number(section_depth, depth - 1, name)
            continue

        # a normal line, count the words
        words += count_words(line)

    return words
|
|
|
|
|
def print_section_number(section_depth, depth, name):
    """Advance the counter of one section level and print its outline entry.

    The counter at *depth* is incremented (or created at 1), the counters of
    deeper levels are reset through ``clear_depth``, and a line made of
    ``depth`` dashes, the dotted numbering and *name* is printed.

    Args:
        section_depth: Dict mapping a level index to its current counter;
            it is updated in place.
        depth: Zero-based level of the section being printed.
        name: Section title, appended as-is after the numbering.

    Returns:
        The same ``section_depth`` dict, after the update.

    Raises:
        KeyError: If a level between 0 and *depth* has no counter yet.
    """
    # bump the counter for this level, starting at 1 on first use
    section_depth[depth] = section_depth.get(depth, 0) + 1
    section_depth = clear_depth(section_depth, depth)

    # numbering looks like "3.1.2." for a section at depth 2
    numbering = "".join(f"{section_depth[level]}." for level in range(depth + 1))
    print("-" * depth + numbering + name)
    return section_depth
|
|
|
|
|
def clear_depth(section_depth, depth):
    """Reset to 0 the counters of the levels directly below *depth*.

    Levels are visited in increasing order starting at ``depth + 1`` and the
    walk stops at the first level that has no counter. The dict is modified
    in place and also returned.
    """
    level = depth + 1
    while level in section_depth:
        section_depth[level] = 0
        level += 1
    return section_depth
|
|
|
|
|
def main():
    """Process every chapter of ``./manuscript`` in chapter-number order.

    Command-line words (accepted in any position, and combinable):
        flamegraph: print one ``<section path> <word count>`` line per
            section, using ``Chapter.parse``.
        verbose: print each file name before processing it.

    Without ``flamegraph`` the numbered outline of each chapter is printed
    through ``parse_chapter``.

    Raises:
        ValueError: If a matched file name does not start with an integer
            followed by ``_``.
    """
    # parse arguments
    args = sys.argv[1:]
    flamegraph = "flamegraph" in args  # generate a flamegraph
    verbose = "verbose" in args

    def chapter_number(path):
        # file names look like "<number>_<anything>.adoc"
        return int(os.path.basename(path).split("_", 1)[0])

    # read chapter files sorted by their number
    filenames = sorted(glob.glob("./manuscript/*_*.adoc"), key=chapter_number)

    # parse each file one by one
    for filename in filenames:
        # explicit encoding so the result does not depend on the platform
        # default
        with open(filename, "r", encoding="utf-8") as f:
            if verbose:
                print(os.path.basename(filename))

            chapt_num = chapter_number(filename)

            if flamegraph:
                res = Chapter(chapt_num, iter(f)).parse()
                for key, count in res.items():
                    print(key, count)
            else:
                # the main title increments the depth-0 counter before it
                # is printed, hence the minus one
                parse_chapter(chapt_num - 1, f)
|
|
|
|
|
# Script entry point: run the command-line tool only when this file is
# executed directly, not when it is imported.
if __name__ == "__main__":
    main()