Last active
October 28, 2024 21:02
-
-
Save sickel/4c62a843ddb82f424c6327289fed1eaa to your computer and use it in GitHub Desktop.
Import of nuclear test data from Johnston's Archive (johnstonsarchive.net)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Convert a Johnston's Archive nuclear-test listing to tab-delimited text.

Download the specific lists from
https://www.johnstonsarchive.net/nuclear/tests/index.html.
Save them with the same file name as the header (e.g.
"Database of nuclear tests, United States: part 1, 1945-1963"),
then run the script with the filename as argument. It writes a tab
delimited text file to ``<filename>.dat``; data rows whose month cannot be
recognised are copied verbatim to ``<filename>.err``.
"""
import sys
from html.parser import HTMLParser  # kept from the original; unused below

# Index in this list == month number ('' occupies index 0 so JAN is 1).
months = ['', 'JAN', 'FEB', 'MAR', 'APR', 'MAY', 'JUN',
          'JUL', 'AUG', 'SEP', 'OCT', 'NOV', 'DEC']

# Fixed-width layout of one data row:
#   field name -> [end column (exclusive), converter]
# Each field starts where the previous one ended.  An end column of -1
# means "the rest of the line".
datafields = {
    'ID': [5, int], 'SERIES': [23, str], 'SHOT': [48, str], 'YEAR': [52, int],
    'MON': [57, str], 'DAY': [62, int], 'TIME': [76, str], 'SITE': [85, str],
    'LAT': [94, float], 'LONG': [106, float], 'LOCCODE': [111, str],
    'HOLE': [124, str], 'HOB': [130, float], 'GZALT': [135, int],
    'GZCODE': [139, str], 'TYPE': [146, str], 'PUR': [152, str],
    'YD-MN': [157, float], 'YD-MX': [163, float], 'YD-EST': [168, str],
    'NT': [174, str], 'YIELD': [180, str], 'MAG': [187, float],
    'CRAT': [192, str], 'VENT': [198, str], 'DEVICE': [204, str],
    'WARHEAD': [214, str], 'SPONSOR': [220, str], 'REPNOTES': [223, str],
    'SOURCES': [-1, str],
}

# Debugging aid: when breakat is True the script stops once the line
# counter passes linebreak.
breakat = False
linebreak = 350


class FieldError(ValueError):
    """A field's text could not be converted; ``field`` names the column."""

    def __init__(self, field, cause):
        super().__init__(str(cause))
        self.field = field


def field_layout(filename):
    """Return ``(fields, country)`` for the listing called *filename*.

    ``fields`` is a private copy of ``datafields`` with the end columns
    shifted for the listing the file name identifies; ``country`` is the
    initial country label ('USA' unless the name says otherwise).  The
    label may later be overridden by ``***`` header lines in the data.
    """
    fields = {name: list(spec) for name, spec in datafields.items()}
    country = 'USA'

    def shift(deltas):
        for name, delta in deltas.items():
            fields[name][0] += delta

    if 'United' in filename:
        shift({'SITE': -2, 'SHOT': -2, 'HOLE': -4, 'PUR': -1, 'NT': -2,
               'VENT': 5})
    if 'USSR' in filename or 'India' in filename:
        # For India and others, country is set later on from the data.
        shift({'YD-MN': 1, 'GZCODE': 1, 'VENT': -1, 'DEVICE': -1})
        country = 'USSR'
    # NOTE(review): the source this was recovered from had lost its
    # indentation, so it is not certain whether this adjustment was meant
    # to apply to every listing or only inside the USSR branch above.
    # It is treated as a top-level condition like its siblings - confirm.
    if 'part 2' in filename or 'part 3' in filename:
        shift({'VENT': 3})
    if 'France' in filename:
        shift({'LONG': 1, 'LAT': 1, 'YIELD': 1, 'PUR': 1, 'YD-MX': -1,
               'GZALT': 2, 'HOB': 2, 'TYPE': 1, 'GZCODE': 1})
        country = 'France'
    if 'China' in filename:
        shift({'YD-MX': 1, 'YD-MN': 1, 'VENT': -1, 'DEVICE': -1})
        country = 'China'
    if 'Kingdom' in filename:
        country = 'UK'
    if 'India' in filename:
        # Country being set from data file.
        shift({'VENT': -1})
    return fields, country


def decode_entities(line):
    """Turn the HTML entities for '>' and '<' back into the characters.

    Only these two are decoded: each entity is four characters wide, so
    leaving them in place would shift every later fixed-width column.
    """
    return line.replace('&gt;', '>').replace('&lt;', '<')


def parse_record(line, fields):
    """Slice one data *line* into a dict according to *fields*.

    Empty fields become ``None``; other values are passed through the
    field's converter.  A non-string field starting with '<' or '>' has
    that character stored under ``<field>sign`` and the remainder
    converted.  The original line (commas replaced by semicolons) is kept
    under the key ``'line'``.

    Raises:
        FieldError: if a converter rejects the field's text.
    """
    rec = {'line': line.replace(',', ';')}
    start = 0
    for field, (end, convert) in fields.items():
        text = (line[start:end] if end > 0 else line[start:]).strip()
        if field == 'MAG' and text == '4.80C':
            # Pretty sure this is an error in the Pakistan data set,
            # nothing like this anywhere else.
            text = '4.80'
        start = end
        if text == '':
            rec[field] = None
            continue
        rec[field] = text  # reserves the key position ahead of any sign key
        try:
            if convert is not str and text[0] in ('<', '>'):
                # NOTE(review): sign keys only exist on rows that need
                # them, so such rows carry more columns than the header
                # written from the first row.  Left as is to keep the
                # output schema unchanged - consider fixed sign columns.
                rec[field + 'sign'] = text[:1]
                text = text[1:]
            rec[field] = convert(text)
        except ValueError as err:
            raise FieldError(field, err) from err
    return rec


def build_timestamp(rec):
    """Return ``YYYYMMDDT<time>`` built from a parsed record.

    Month and day are zero-padded to two digits so the date part is
    unambiguous.  A missing year, day or time is rendered as 'None'.

    Raises:
        ValueError: if ``rec['MON']`` is not one of the known month codes.
    """
    month = str(months.index(rec['MON'])).zfill(2)
    day = rec['DAY']
    day_text = f'{day:02d}' if isinstance(day, int) else str(day)
    return f"{rec['YEAR']}{month}{day_text}T{rec['TIME']}"


def convert_file(filename, outfile, errfilename):
    """Read the listing *filename* and write the .dat and .err files.

    Only lines between ``<pre>`` and ``</pre>`` are treated as data; lines
    starting with ``***`` inside that region set the current country.
    Progress and skipped lines are echoed to stdout.  Exits with status 3
    if a field cannot be converted to its declared type.
    """
    fields, country = field_layout(filename)
    readdata = False
    headerprinted = False
    lineno = 1
    with open(outfile, 'w') as outputfile, \
            open(errfilename, 'w') as errorfile, \
            open(filename, 'r', encoding='cp1252') as inputfile:
        for line in inputfile:
            line = decode_entities(line.strip('\n'))
            if not readdata:
                if '<pre>' in line:
                    readdata = True
                print(line)
                continue

            lineno += 1
            print(f'--->{lineno}')
            if line.startswith('***'):
                country = line.replace('*', ' ').strip(' ')
                continue
            if '</pre>' in line:
                readdata = False
                continue
            # Data rows start with a numeric ID; anything else is a
            # heading or separator inside the <pre> block.
            try:
                int(line[0:5])
            except ValueError:
                print(line)
                continue

            try:
                rec = parse_record(line, fields)
            except FieldError as err:
                print(err)
                print(line)
                print(err.field)
                sys.exit(3)
            if rec['ID'] is None:
                continue

            try:
                timestamp = build_timestamp(rec)
            except ValueError:
                # Unknown or missing month: keep the raw row for review.
                errorfile.write(line)
                errorfile.write('\n')
                continue
            rec['COUNTRY'] = country
            rec['TIMESTAMP'] = timestamp
            print(rec)
            if breakat and lineno > linebreak:
                sys.exit()

            data = [str(value) for value in rec.values()]
            print(lineno)
            if not headerprinted:
                outputfile.write('\t'.join(rec.keys()))
                outputfile.write('\n')
                headerprinted = True
            outputfile.write('\t'.join(data))
            outputfile.write('\n')


def main(argv=None):
    """Command-line entry point: ``script <downloaded listing file>``."""
    argv = sys.argv if argv is None else argv
    if len(argv) < 2:
        sys.exit('usage: python <script> <downloaded listing file>')
    filename = argv[1]
    convert_file(filename, filename + '.dat', filename + '.err')


if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Download the specific lists from https://www.johnstonsarchive.net/nuclear/tests/index.html. Save each one under the same file name as its header (e.g. "Database of nuclear tests, United States: part 1, 1945-1963"), then run the Python script with that filename as its argument; it will produce a tab-delimited text file.