Skip to content

Instantly share code, notes, and snippets.

@sickel
Last active October 28, 2024 21:02
Show Gist options
  • Save sickel/4c62a843ddb82f424c6327289fed1eaa to your computer and use it in GitHub Desktop.
Save sickel/4c62a843ddb82f424c6327289fed1eaa to your computer and use it in GitHub Desktop.
Import of data from johnstonarchive nuclear tests
import sys
from html.parser import HTMLParser
"""Download the specific lists from https://www.johnstonsarchive.net/nuclear/tests/index.html.
Save each one under the same file name as its page header (e.g.
"Database of nuclear tests, United States: part 1, 1945-1963"),
then run the script with that file name as its argument; it writes a tab-delimited text file."""
# The saved list to convert is the only command-line argument. Stop with a
# usage message instead of an IndexError traceback when it is missing.
if len(sys.argv) < 2:
    sys.exit(f'usage: {sys.argv[0]} <saved nuclear test list file>')
filename = sys.argv[1]
# Both output names are derived from the input name.
outfile = filename + '.dat'      # receives the tab-delimited records
errfilename = filename + '.err'  # receives data lines whose month is not recognised
# Upper-case month abbreviations. The empty first entry makes the list
# 1-based, so months.index('JAN') == 1.
months = [''] + 'JAN FEB MAR APR MAY JUN JUL AUG SEP OCT NOV DEC'.split()

# Fixed-width layout of one data line, in column order:
# (field name, end column used as the exclusive slice end, converter).
# An end column of -1 means "everything up to the end of the line".
_FIELD_LAYOUT = (
    ('ID', 5, int),
    ('SERIES', 23, str),
    ('SHOT', 48, str),
    ('YEAR', 52, int),
    ('MON', 57, str),
    ('DAY', 62, int),
    ('TIME', 76, str),
    ('SITE', 85, str),
    ('LAT', 94, float),
    ('LONG', 106, float),
    ('LOCCODE', 111, str),
    ('HOLE', 124, str),
    ('HOB', 130, float),
    ('GZALT', 135, int),
    ('GZCODE', 139, str),
    ('TYPE', 146, str),
    ('PUR', 152, str),
    ('YD-MN', 157, float),
    ('YD-MX', 163, float),
    ('YD-EST', 168, str),
    ('NT', 174, str),
    ('YIELD', 180, str),
    ('MAG', 187, float),
    ('CRAT', 192, str),
    ('VENT', 198, str),
    ('DEVICE', 204, str),
    ('WARHEAD', 214, str),
    ('SPONSOR', 220, str),
    ('REPNOTES', 223, str),
    ('SOURCES', -1, str),
)

# name -> [end column, converter]. The values are lists, not tuples, because
# the end columns are adjusted in place for individual source files.
datafields = {name: [end, convert] for name, end, convert in _FIELD_LAYOUT}
# Debug aid: with breakat set to True the run stops once more than
# `linebreak` lines of the data section have been read.
breakat = False
linebreak = 350


def _shift(deltas):
    """Move the end column of each named field by the given number of characters.

    deltas maps a field name in `datafields` to a signed column offset; the
    layout table is modified in place.
    """
    for field, delta in deltas.items():
        datafields[field][0] += delta


# The column positions differ slightly between the source files, so the base
# layout is adjusted according to words found in the file name (the file is
# expected to be saved under the title of the list it contains).
country = 'USA'
if 'United' in filename:
    # NOTE(review): presumably this also matches the "United Kingdom" list,
    # whose country name is overridden further down - confirm.
    _shift({'SITE': -2, 'SHOT': -2, 'HOLE': -4, 'PUR': -1, 'NT': -2, 'VENT': 5})
if 'USSR' in filename or 'India' in filename:
    country = 'USSR'  # For India and others, country set later on
    _shift({'YD-MN': 1, 'GZCODE': 1, 'VENT': -1, 'DEVICE': -1})
    # NOTE(review): the original indentation was lost. This check is assumed
    # to belong to the USSR/India branch, because applying it to every file
    # would move VENT past the end column of DEVICE for the United States
    # lists - confirm against the source files.
    if 'part 2' in filename or 'part 3' in filename:
        _shift({'VENT': 3})
if 'France' in filename:
    _shift({
        'LONG': 1,
        'LAT': 1,
        'YIELD': 1,
        'PUR': 1,
        'YD-MX': -1,
        'GZALT': 2,
        'HOB': 2,
        'TYPE': 1,
        'GZCODE': 1,
    })
    country = 'France'
if 'China' in filename:
    _shift({'YD-MX': 1, 'YD-MN': 1, 'VENT': -1, 'DEVICE': -1})
    country = 'China'
if 'Kingdom' in filename:
    country = 'UK'
if 'India' in filename:
    # Applied on top of the shared USSR/India adjustments above.
    _shift({'VENT': -1})
    # Country being set from data file
def _parse_fields(line):
    """Cut one fixed-width data line into a record dict with typed values.

    The line is sliced at the end columns listed in `datafields`. Empty
    fields become None; other values are passed through the field's
    converter. A leading '<' or '>' on a non-string field is stored
    separately under '<field>sign' and the remainder is converted.

    The returned dict also holds the raw line (commas replaced by
    semicolons) under the key 'line'.

    A value that cannot be converted is reported on stdout together with the
    line and the field name, and the script exits with status 3.
    """
    rec = {'line': line.replace(',', ';')}
    start = 0
    for field, (end, convert) in datafields.items():
        # A non-positive end column marks the last field: take the rest.
        text = line[start:end].strip() if end > 0 else line[start:].strip()
        start = end
        if field == 'MAG' and text == '4.80C':
            # Pretty sure this is an error in the Pakistan data set,
            # nothing like this anywhere else
            text = '4.80'
        if text == '':
            rec[field] = None
            continue
        # Insert the field before a possible sign entry so that the column
        # order of the output is <field>, <field>sign.
        rec[field] = text
        try:
            if convert is not str and text[0] in ('<', '>'):
                rec[field + 'sign'] = text[:1]
                rec[field] = convert(text[1:])
            else:
                rec[field] = convert(text)
        except ValueError as e:
            print(e)
            print(line)
            print(field)
            sys.exit(3)
    return rec


readdata = False       # True while inside the <pre> ... </pre> data section
headerprinted = False  # the column header is written before the first record
lineno = 1
# Context managers make sure all three files are closed, also when the script
# leaves through sys.exit().
with open(filename, 'r', encoding='cp1252') as inputfile, \
        open(outfile, 'w') as outputfile, \
        open(errfilename, 'w') as errorfile:
    for line in inputfile:
        line = line.strip('\n')
        # Undo the HTML escaping of comparison signs; the sign handling in
        # _parse_fields looks for the literal characters.
        line = line.replace('&gt;', '>').replace('&lt;', '<')

        if not readdata:
            # Outside the data section: only watch for the opening tag.
            if '<pre>' in line:
                readdata = True
            print(line)
            continue

        lineno += 1
        print(f'--->{lineno}')
        if line.startswith('***'):
            # A line of the form "*** name ***" gives the country for the
            # records that follow.
            country = line.replace('*', ' ').strip(' ')
            continue
        if '</pre>' in line:
            readdata = False
            continue
        # Data lines start with a numeric ID; anything else is only echoed.
        try:
            int(line[0:5])
        except ValueError:
            print(line)
            continue

        rec = _parse_fields(line)
        if rec['ID'] is None:
            continue
        try:
            mndnum = months.index(rec['MON'])
        except ValueError:
            # Unknown or missing month: keep the line for manual inspection.
            errorfile.write(line)
            errorfile.write('\n')
            continue
        rec['COUNTRY'] = country
        # Month and day are both zero-padded to two digits so that the date
        # part is unambiguous. A missing day still shows up as 'None'.
        month = str(mndnum).zfill(2)
        day = str(rec['DAY']).zfill(2)
        rec['TIMESTAMP'] = f"{rec['YEAR']}{month}{day}T{rec['TIME']}"
        print(rec)
        if breakat and lineno > linebreak:
            sys.exit()

        data = [str(value) for value in rec.values()]
        print(lineno)
        if not headerprinted:
            # NOTE(review): the header comes from the first record only, but
            # records with a '<'/'>' prefixed value carry extra '<field>sign'
            # entries, so their rows have more columns than the header.
            outputfile.write('\t'.join(rec.keys()))
            outputfile.write('\n')
            headerprinted = True
        outputfile.write('\t'.join(data))
        outputfile.write('\n')
@sickel
Copy link
Author

sickel commented Oct 28, 2024

Download the specific lists from https://www.johnstonsarchive.net/nuclear/tests/index.html. Save each one under the same file name as its page header (e.g. "Database of nuclear tests, United States: part 1, 1945-1963"), then run the Python script with that file name as its argument; it writes a tab-delimited text file.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment