mimoo · March 8, 2021 04:48
diff --git a/README.md b/README.md
diff --git a/get_sections.py b/get_sections.py
 import glob
 import os
 import re
 import sys
 from collections.abc import Iterator

 ####


 class Chapter:
    """Per-section word counter for a single chapter file.

    ``parse`` walks the chapter line by line, tracks the current title
    hierarchy and counts the words that fall under each title.
    """

    def __init__(self, num: int, data: Iterator[str]):
        """Keep the chapter number (as a string) and the iterator of lines.

        Args:
            num: chapter number; it becomes the first component of every
                section number.
            data: iterator over the chapter's lines. It has to be a real
                iterator because ``parse`` calls ``next`` on it.
        """
        self.num = str(num)
        self.data = data

    def parse(self):
        """Count the words found under each title of the chapter.

        The first line is always skipped. A line starting with "////" or
        "****" opens or closes a skipped block, and the delimiter lines
        themselves are never counted. A line starting with "=" is a title
        whose depth is the number of leading "=" (as reported by
        ``count_equal``); titles with an empty name are ignored. Lines seen
        before the first title are not counted.

        Returns:
            dict mapping each section path (as built by ``path_from``) to
            the number of words between that title and the next one.

        Raises:
            AssertionError: if a depth-1 title appears after another title,
                or if two titles produce the same path.
        """
        words = {}
        nums = []
        prefix = []
        path = None  # path of the section currently being counted
        in_comment = False
        in_formula = False

        # the first line of the chapter is never parsed
        next(self.data, None)

        for line in self.data:
            # "////" opens or closes a comment block; neither delimiter counts
            if line.startswith("////"):
                in_comment = not in_comment
                continue
            if in_comment:
                continue

            # "****" blocks are skipped the same way
            if line.startswith("****"):
                in_formula = not in_formula
                continue
            if in_formula:
                continue

            depth = count_equal(line)
            if depth == 0:
                # a normal line: count its words once a title has been seen
                if path is not None:
                    words[path] += count_words(line)
                continue

            # a title: the name is everything after the leading run of "=",
            # so a title that itself contains "=" is kept whole
            name = simplify_name(line.rstrip("\n").lstrip("="))
            if name == "":
                continue

            if depth == 1:
                # main title: initialization
                assert not prefix
                assert not words
                nums = [self.num]
                prefix = [name]
            elif depth > len(prefix):
                # nesting: first subsection one level below the current one
                nums.append(1)
                prefix.append(name)
            else:
                # same level or unnesting: drop every level deeper than this
                # title (possibly several), then move on to the next sibling
                del nums[depth:]
                del prefix[depth:]
                nums[-1] += 1
                prefix[-1] = name

            path = path_from(nums, prefix)
            assert path not in words
            words[path] = 0

        return words


 def path_from(nums, prefix):
    """Build the ";"-separated path of a section.

    Every entry of ``prefix`` contributes one ``"<number> <title>"`` element,
    where the number is the "."-joined start of ``nums`` up to that entry's
    level. ``nums`` itself is left untouched.
    """
    labels = [str(num) for num in nums]

    # pad a one-character chapter number so that paths sort by chapter
    if len(labels[0]) == 1:
        labels[0] = "0" + labels[0]

    return ";".join(
        ".".join(labels[:level]) + " " + title
        for level, title in enumerate(prefix, start=1)
    )


 def simplify_name(name: str):
    """Return ``name`` without its leading and trailing space characters."""
    # only " " is removed; tabs and other whitespace are kept as they are
    return name.strip(" ")

 ####


 def count_equal(line):
    """Count the "=" characters that start ``line``, looking at no more than
    its first five characters."""
    head = line[:5]
    return len(head) - len(head.lstrip("="))


 def count_words(line):
    """Return the number of whitespace-separated words in ``line``.

    Splitting on any run of whitespace means a blank line counts as zero
    words, and repeated spaces or the trailing newline add nothing.
    """
    return len(line.split())


 def parse_chapter(chapt_num, chapter):
    """Print the numbered outline of one chapter and return its word count.

    The first line is always skipped. A line starting with "////" or "****"
    opens or closes a skipped block, and the delimiter lines themselves are
    never counted. Every line starting with "=" and carrying a non-empty
    name is handed to ``print_section_number``; every other line is counted.

    Args:
        chapt_num: initial value of the level-0 counter.
        chapter: iterable over the chapter's lines.

    Returns:
        The number of words on non-title lines outside the skipped blocks.
    """
    section_depth = {0: chapt_num}
    words = 0
    in_comment = False
    in_formula = False

    lines = iter(chapter)
    next(lines, None)  # the first line is never parsed

    for line in lines:
        # "////" opens or closes a comment block; neither delimiter counts
        if line.startswith("////"):
            in_comment = not in_comment
            continue
        if in_comment:
            continue

        # "****" blocks are skipped the same way
        if line.startswith("****"):
            in_formula = not in_formula
            continue
        if in_formula:
            continue

        depth = count_equal(line)
        if depth == 0:
            # a normal line: count the words
            words += count_words(line)
            continue

        # a title: the name is everything after the leading run of "=",
        # so a title that itself contains "=" is kept whole
        name = line.rstrip("\n").lstrip("=")
        if name == "":
            continue

        section_depth = print_section_number(section_depth, depth - 1, name)

    return words


 def print_section_number(section_depth, depth, name):
    """Advance the counter of level ``depth`` and print the numbered title.

    Args:
        section_depth: dict mapping a level to its current counter; it is
            updated in place.
        depth: zero-based level of the title.
        name: text printed after the number.

    Returns:
        The updated ``section_depth``.
    """
    # a title that skips levels would otherwise hit a missing key below;
    # give the skipped levels a 0 counter so every level up to depth exists
    for level in range(depth):
        section_depth.setdefault(level, 0)

    section_depth[depth] = section_depth.get(depth, 0) + 1
    section_depth = clear_depth(section_depth, depth)

    # one "-" per level of nesting, then "1.2.3." style numbering
    numbering = "".join(
        str(section_depth[level]) + "." for level in range(depth + 1)
    )
    print("-" * depth + numbering + name)
    return section_depth


 def clear_depth(section_depth, depth):
    """Zero the counters of the levels directly below ``depth``.

    Walks down from ``depth + 1`` and stops at the first level that has no
    entry. The dict is modified in place and also returned.
    """
    level = depth + 1
    while level in section_depth:
        section_depth[level] = 0
        level += 1
    return section_depth


 def main():
    """Print section information for every chapter under ./manuscript.

    Command-line words, accepted in any position:
        flamegraph: print one "<section path> <word count>" line per section
            instead of the numbered outline.
        verbose: print each file name before it is processed.

    Chapter files match "./manuscript/*_*.adoc" and are processed in the
    order of the integer that precedes the first "_" in their name; files
    whose prefix is not an integer are skipped.
    """
    args = sys.argv[1:]
    flamegraph = "flamegraph" in args  # generate a flamegraph
    verbose = "verbose" in args

    # collect (chapter number, filename), ignoring files that are not chapters
    chapters = []
    for filename in glob.glob("./manuscript/*_*.adoc"):
        number = os.path.basename(filename).split("_", 1)[0]
        try:
            chapters.append((int(number), filename))
        except ValueError:
            continue
    chapters.sort(key=lambda chapter: chapter[0])

    # parse each file one by one
    for chapt_num, filename in chapters:
        with open(filename, "r", encoding="utf-8") as f:
            if verbose:
                print(os.path.basename(filename))

            if flamegraph:
                for key, count in Chapter(chapt_num, iter(f)).parse().items():
                    print(key, count)
            else:
                # start one below: print_section_number increments the
                # level-0 counter when it sees the chapter's own title
                parse_chapter(chapt_num - 1, f)


 # run main() only when this file is executed as a script, not on import
 if __name__ == "__main__":
    main()
	import glob
	import re
	import os
	import sys

	####


	class Chapter:
	    """Per-section word counter for a single chapter file.

	    ``parse`` walks the chapter line by line, tracks the current title
	    hierarchy and counts the words that fall under each title.
	    """

	    def __init__(self, num: int, data: Iterator[str]):
	        """Keep the chapter number (as a string) and the iterator of lines.

	        Args:
	            num: chapter number; it becomes the first component of every
	                section number.
	            data: iterator over the chapter's lines. It has to be a real
	                iterator because ``parse`` calls ``next`` on it.
	        """
	        self.num = str(num)
	        self.data = data

	    def parse(self):
	        """Count the words found under each title of the chapter.

	        The first line is always skipped. A line starting with "////" or
	        "****" opens or closes a skipped block, and the delimiter lines
	        themselves are never counted. A line starting with "=" is a title
	        whose depth is the number of leading "=" (as reported by
	        ``count_equal``); titles with an empty name are ignored. Lines
	        seen before the first title are not counted.

	        Returns:
	            dict mapping each section path (as built by ``path_from``) to
	            the number of words between that title and the next one.

	        Raises:
	            AssertionError: if a depth-1 title appears after another
	                title, or if two titles produce the same path.
	        """
	        words = {}
	        nums = []
	        prefix = []
	        path = None  # path of the section currently being counted
	        in_comment = False
	        in_formula = False

	        # the first line of the chapter is never parsed
	        next(self.data, None)

	        for line in self.data:
	            # "////" opens or closes a comment block; neither delimiter counts
	            if line.startswith("////"):
	                in_comment = not in_comment
	                continue
	            if in_comment:
	                continue

	            # "****" blocks are skipped the same way
	            if line.startswith("****"):
	                in_formula = not in_formula
	                continue
	            if in_formula:
	                continue

	            depth = count_equal(line)
	            if depth == 0:
	                # a normal line: count its words once a title has been seen
	                if path is not None:
	                    words[path] += count_words(line)
	                continue

	            # a title: the name is everything after the leading run of "=",
	            # so a title that itself contains "=" is kept whole
	            name = simplify_name(line.rstrip("\n").lstrip("="))
	            if name == "":
	                continue

	            if depth == 1:
	                # main title: initialization
	                assert not prefix
	                assert not words
	                nums = [self.num]
	                prefix = [name]
	            elif depth > len(prefix):
	                # nesting: first subsection one level below the current one
	                nums.append(1)
	                prefix.append(name)
	            else:
	                # same level or unnesting: drop every level deeper than
	                # this title (possibly several), then move on to the next
	                # sibling
	                del nums[depth:]
	                del prefix[depth:]
	                nums[-1] += 1
	                prefix[-1] = name

	            path = path_from(nums, prefix)
	            assert path not in words
	            words[path] = 0

	        return words


	def path_from(nums, prefix):
	    """Build the ";"-separated path of a section.

	    Every entry of ``prefix`` contributes one ``"<number> <title>"``
	    element, where the number is the "."-joined start of ``nums`` up to
	    that entry's level. ``nums`` itself is left untouched.
	    """
	    labels = [str(num) for num in nums]

	    # pad a one-character chapter number so that paths sort by chapter
	    if len(labels[0]) == 1:
	        labels[0] = "0" + labels[0]

	    return ";".join(
	        ".".join(labels[:level]) + " " + title
	        for level, title in enumerate(prefix, start=1)
	    )


	def simplify_name(name: str):
	    """Return ``name`` without its leading and trailing space characters."""
	    # only " " is removed; tabs and other whitespace are kept as they are
	    return name.strip(" ")

	####


	def count_equal(line):
	    """Count the "=" characters that start ``line``, looking at no more
	    than its first five characters."""
	    head = line[:5]
	    return len(head) - len(head.lstrip("="))


	def count_words(line):
	    """Return the number of whitespace-separated words in ``line``.

	    Splitting on any run of whitespace means a blank line counts as zero
	    words, and repeated spaces or the trailing newline add nothing.
	    """
	    return len(line.split())


	def parse_chapter(chapt_num, chapter):
	    """Print the numbered outline of one chapter and return its word count.

	    The first line is always skipped. A line starting with "////" or
	    "****" opens or closes a skipped block, and the delimiter lines
	    themselves are never counted. Every line starting with "=" and
	    carrying a non-empty name is handed to ``print_section_number``;
	    every other line is counted.

	    Args:
	        chapt_num: initial value of the level-0 counter.
	        chapter: iterable over the chapter's lines.

	    Returns:
	        The number of words on non-title lines outside the skipped blocks.
	    """
	    section_depth = {0: chapt_num}
	    words = 0
	    in_comment = False
	    in_formula = False

	    lines = iter(chapter)
	    next(lines, None)  # the first line is never parsed

	    for line in lines:
	        # "////" opens or closes a comment block; neither delimiter counts
	        if line.startswith("////"):
	            in_comment = not in_comment
	            continue
	        if in_comment:
	            continue

	        # "****" blocks are skipped the same way
	        if line.startswith("****"):
	            in_formula = not in_formula
	            continue
	        if in_formula:
	            continue

	        depth = count_equal(line)
	        if depth == 0:
	            # a normal line: count the words
	            words += count_words(line)
	            continue

	        # a title: the name is everything after the leading run of "=",
	        # so a title that itself contains "=" is kept whole
	        name = line.rstrip("\n").lstrip("=")
	        if name == "":
	            continue

	        section_depth = print_section_number(section_depth, depth - 1, name)

	    return words


	def print_section_number(section_depth, depth, name):
	    """Advance the counter of level ``depth`` and print the numbered title.

	    Args:
	        section_depth: dict mapping a level to its current counter; it is
	            updated in place.
	        depth: zero-based level of the title.
	        name: text printed after the number.

	    Returns:
	        The updated ``section_depth``.
	    """
	    # a title that skips levels would otherwise hit a missing key below;
	    # give the skipped levels a 0 counter so every level up to depth exists
	    for level in range(depth):
	        section_depth.setdefault(level, 0)

	    section_depth[depth] = section_depth.get(depth, 0) + 1
	    section_depth = clear_depth(section_depth, depth)

	    # one "-" per level of nesting, then "1.2.3." style numbering
	    numbering = "".join(
	        str(section_depth[level]) + "." for level in range(depth + 1)
	    )
	    print("-" * depth + numbering + name)
	    return section_depth


	def clear_depth(section_depth, depth):
	    """Zero the counters of the levels directly below ``depth``.

	    Walks down from ``depth + 1`` and stops at the first level that has
	    no entry. The dict is modified in place and also returned.
	    """
	    level = depth + 1
	    while level in section_depth:
	        section_depth[level] = 0
	        level += 1
	    return section_depth


	def main():
	    """Print section information for every chapter under ./manuscript.

	    Command-line words, accepted in any position:
	        flamegraph: print one "<section path> <word count>" line per
	            section instead of the numbered outline.
	        verbose: print each file name before it is processed.

	    Chapter files match "./manuscript/*_*.adoc" and are processed in the
	    order of the integer that precedes the first "_" in their name; files
	    whose prefix is not an integer are skipped.
	    """
	    args = sys.argv[1:]
	    flamegraph = "flamegraph" in args  # generate a flamegraph
	    verbose = "verbose" in args

	    # collect (chapter number, filename), ignoring files that are not
	    # chapters; the pattern needs both "*" wildcards around the "_"
	    chapters = []
	    for filename in glob.glob("./manuscript/*_*.adoc"):
	        number = os.path.basename(filename).split("_", 1)[0]
	        try:
	            chapters.append((int(number), filename))
	        except ValueError:
	            continue
	    chapters.sort(key=lambda chapter: chapter[0])

	    # parse each file one by one
	    for chapt_num, filename in chapters:
	        with open(filename, "r", encoding="utf-8") as f:
	            if verbose:
	                print(os.path.basename(filename))

	            if flamegraph:
	                for key, count in Chapter(chapt_num, iter(f)).parse().items():
	                    print(key, count)
	            else:
	                # start one below: print_section_number increments the
	                # level-0 counter when it sees the chapter's own title
	                parse_chapter(chapt_num - 1, f)


	# run main() only when this file is executed as a script, not on import
	if __name__ == "__main__":
	    main()