Last active
November 15, 2018 17:33
-
-
Save eezis/6ed68ff4036de66fd32479a5977e8d04 to your computer and use it in GitHub Desktop.
Udacity Nanotrading Transcript Cleaner
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
Turn the .srt files into .txt files.
1. Unzip the lesson files into a directory.
2. Run this code in that directory.
The code will strip out the timestamps and carriage returns
and produce a text file that you can use for reference and class notes.
""" | |
import re | |
from os import listdir, getcwd | |
from os.path import isfile, join | |
# Latin-1 maps every byte to exactly one code point, so decoding and encoding
# with this same codec can never fail and passes non-ASCII bytes through
# unchanged, whatever the subtitle file's real encoding is.
_ENCODING = "ISO-8859-1"

# Start of an SRT cue timing line, e.g. "00:00:01,500 -->" (hours optional).
_TIMING_START = r'(?:\d+:)?\d{1,2}:\d{2}[,.]\d+[ \t]*-->'

# A cue counter line. It is only recognised when a timing line follows it
# directly, so caption text that merely begins with a digit is left alone.
_CUE_INDEX = re.compile(r'^\d+[ \t]*(?=\n' + _TIMING_START + r')', flags=re.M)

# A complete cue timing line.
_CUE_TIMING = re.compile(r'^' + _TIMING_START + r'[^\n]*$', flags=re.M)


def clean_transcript(raw_text):
    """Return the caption text of an SRT document as running prose.

    Cue counter lines and cue timing lines are blanked (their line breaks
    are kept), then every run of four newlines -- the gap this leaves
    between two consecutive cues -- is replaced by a single space.

    Args:
        raw_text: full contents of an .srt file, using "\\n" line endings.

    Returns:
        The text with counters and timestamps removed and consecutive cues
        joined by spaces. Blank lines left at the very start are not
        trimmed, and line breaks inside a multi-line cue are preserved.
    """
    without_counters = _CUE_INDEX.sub('', raw_text)
    without_timings = _CUE_TIMING.sub('', without_counters)
    return without_timings.replace('\n\n\n\n', ' ')


def main():
    """Write a cleaned .txt transcript next to every .srt file in the cwd.

    Only files whose name ends in ".srt" (any letter case) are read.
    Processing every file would overwrite non-subtitle files in place --
    including this script -- because their output name would equal their
    input name. An existing .txt file with the target name is overwritten.
    """
    currpath = getcwd()
    srt_names = [
        name for name in listdir(currpath)
        if name.lower().endswith('.srt') and isfile(join(currpath, name))
    ]
    for name in srt_names:
        # Context managers guarantee both handles are closed (and the output
        # flushed) even if reading or writing raises.
        with open(join(currpath, name), mode='r', encoding=_ENCODING) as source:
            raw_text = source.read()
        # Swap only the trailing extension; an earlier ".srt" inside the
        # file name is left untouched.
        target = join(currpath, name[:-len('.srt')] + '.txt')
        with open(target, mode='w', encoding=_ENCODING) as fixed:
            fixed.write(clean_transcript(raw_text))
    print('done')


if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment