sickel · October 28, 2024 21:02 · sickel · Oct 28, 2024
diff --git a/johnstoncsv.py b/johnstoncsv.py
 import sys
 from html.parser import HTMLParser

 """Download the specific lists from https://www.johnstonsarchive.net/nuclear/tests/index.html. 
 Save them with the same file name as the header (e.g. 
 "Database of nuclear tests, United States: part 1, 1945-1963") 
 run the script with the filename as argument and it will make a tab delimited text file."""

 # Command-line entry: the single argument is the saved archive page. Results
 # are written next to it as <filename>.dat (tab-delimited records) and
 # <filename>.err (data lines whose month field was not recognised).
 if len(sys.argv) < 2:
     sys.exit(f'usage: {sys.argv[0]} FILENAME')
 filename = sys.argv[1]
 outfile = filename + '.dat'
 errfilename = filename + '.err'

 # Month abbreviations; the empty first entry makes list index == month number.
 months = ['','JAN','FEB','MAR','APR','MAY','JUN','JUL','AUG','SEP','OCT','NOV','DEC']

 # Column layout of one fixed-width data line: name -> [end offset, converter].
 # A field starts where the previous one ended; an end offset of -1 means
 # "the rest of the line". Offsets are adjusted per data set further down.
 datafields = {'ID':[5,int],'SERIES':[23,str],'SHOT':[48,str],'YEAR':[52,int],
   'MON':[57,str], 'DAY':[62,int], 'TIME':[76,str], 'SITE':[85,str],
   'LAT':[94,float], 'LONG':[106,float], 'LOCCODE':[111,str], 'HOLE' :[124,str],
   'HOB':[130,float], 'GZALT':[135,int], 'GZCODE':[139,str], 'TYPE':[146,str],
   'PUR':[152,str], 'YD-MN':[157,float], 'YD-MX':[163,float], 'YD-EST':[168,str],
   'NT':[174,str], 'YIELD':[180,str], 'MAG': [187,float], 'CRAT':[192,str],
   'VENT': [198,str], 'DEVICE': [204,str], 'WARHEAD':[214,str],
   'SPONSOR':[220,str], 'REPNOTES':[223,str], 'SOURCES':[-1,str]}

 # Debugging aid: with breakat set to True the run stops once lineno exceeds
 # linebreak.
 breakat = False
 linebreak = 350

 country = 'USA'

 # Per-data-set column corrections, selected by substrings of the file name.
 # NOTE(review): 'United' matches every file name containing it (one with
 # "United Kingdom" as well as "United States") - confirm that is intended.
 if "United" in filename:
     datafields['SITE'][0] -= 2
     datafields['SHOT'][0] -= 2
     datafields['HOLE'][0] -= 4
     datafields['PUR'][0] -= 1
     datafields['NT'][0] -= 2
     datafields['VENT'][0] += 5

 if 'USSR' in filename or 'India' in filename:
     datafields['YD-MN'][0] += 1
     country = 'USSR' # For india and others, country set later on
     datafields['GZCODE'][0] += 1
     datafields['VENT'][0] -= 1
     datafields['DEVICE'][0] -= 1
     if 'part 2' in filename or 'part 3' in filename:
         datafields['VENT'][0] += 3

 if 'France' in filename:
     datafields['LONG'][0] += 1
     datafields['LAT'][0] += 1
     datafields['YIELD'][0] += 1
     datafields['PUR'][0] += 1
     datafields['YD-MX'][0] -= 1
     datafields['GZALT'][0] += 2
     datafields['HOB'][0] += 2
     datafields['TYPE'][0] += 1
     datafields['GZCODE'][0] += 1
     country = 'France'

 if 'China' in filename:
     datafields['YD-MX'][0] += 1
     datafields['YD-MN'][0] += 1
     datafields['VENT'][0] -= 1
     datafields['DEVICE'][0] -= 1
     country = 'China'

 if 'Kingdom' in filename:
     country = 'UK'

 if 'India' in filename:
     datafields['VENT'][0] -= 1
     # Country being set from data file

 readdata = False       # True while inside the <pre>...</pre> data section
 headerprinted = False  # the column header is written before the first record
 lineno = 1
 # Context managers guarantee all three files are closed, including when the
 # script leaves through sys.exit() or an unexpected exception.
 with open(filename, 'r', encoding='cp1252') as inputfile, \
         open(outfile, 'w') as outputfile, \
         open(errfilename, 'w') as errorfile:
     for line in inputfile:
         line = line.strip('\n')
         # Decode the HTML entities first so the fixed-width columns line up.
         line = line.replace('&gt;', '>')
         line = line.replace('&lt;', '<')

         if not readdata:
             # Still in the page preamble: echo it and wait for the <pre> tag.
             if '<pre>' in line:
                 readdata = True
             print(line)
             continue

         lineno += 1
         print(f'--->{lineno}')
         if line.startswith('***'):
             # A '***' line carries the country for the records that follow.
             country = line.replace('*', ' ').strip(' ')
             continue
         if '</pre>' in line:
             readdata = False
             continue
         # Only lines whose first five characters form an integer are records.
         try:
             int(line[0:5])
         except ValueError:
             print(line)
             continue

         rec = {'line': line.replace(',', ';')}
         start = 0
         for field, (end, convert) in datafields.items():
             if end > 0:
                 rec[field] = line[start:end].strip()
             else:
                 rec[field] = line[start:].strip()
             if field == 'MAG' and rec[field] == '4.80C':
                 rec[field] = '4.80' # Pretty sure this is an error in the Pakistan data set, nothing like this anywhere else
             start = end
             if rec[field] == '':
                 rec[field] = None
                 continue
             # Convert to correct type; a leading '<' or '>' on a numeric
             # field is split off into a separate '<field>sign' entry.
             try:
                 if convert is not str and rec[field][0] in ('<', '>'):
                     rec[field + 'sign'] = rec[field][:1]
                     rec[field] = convert(rec[field][1:])
                 else:
                     rec[field] = convert(rec[field])
             except ValueError as e:
                 print(e)
                 print(line)
                 print(field)
                 sys.exit(3)

         if rec['ID'] is None:
             continue
         try:
             mndnum = months.index(rec['MON'])
         except ValueError:
             # Unrecognised (or missing) month: divert the line to the .err file.
             errorfile.write(line)
             errorfile.write('\n')
             continue
         mndnum = str(mndnum).zfill(2)
         rec['COUNTRY'] = country
         # Double-quoted f-string so the nested subscripts also parse on
         # Python versions older than 3.12.
         rec['TIMESTAMP'] = f"{rec['YEAR']}{mndnum}{rec['DAY']}T{rec['TIME']}"
         print(rec)
         if breakat and lineno > linebreak:
             sys.exit()
         data = [str(value) for value in rec.values()]
         print(lineno)
         # NOTE(review): the header is taken from the first record only; a
         # later record that gains a '<field>sign' entry writes more columns
         # than the header has.
         if not headerprinted:
             outputfile.write('\t'.join(rec.keys()))
             outputfile.write('\n')
             headerprinted = True
         outputfile.write('\t'.join(data))
         outputfile.write('\n')
	import sys
	from html.parser import HTMLParser

	"""Download the specific lists from https://www.johnstonsarchive.net/nuclear/tests/index.html.
	Save them with the same file name as the header (e.g.
	"Database of nuclear tests, United States: part 1, 1945-1963")
	run the script with the filename as argument and it will make a tab delimited text file."""

	# Command-line entry: the single argument is the saved archive page. Results
	# are written next to it as <filename>.dat (tab-delimited records) and
	# <filename>.err (data lines whose month field was not recognised).
	if len(sys.argv) < 2:
	    sys.exit(f'usage: {sys.argv[0]} FILENAME')
	filename = sys.argv[1]
	outfile = filename + '.dat'
	errfilename = filename + '.err'

	# Month abbreviations; the empty first entry makes list index == month number.
	months = ['','JAN','FEB','MAR','APR','MAY','JUN','JUL','AUG','SEP','OCT','NOV','DEC']

	# Column layout of one fixed-width data line: name -> [end offset, converter].
	# A field starts where the previous one ended; an end offset of -1 means
	# "the rest of the line". Offsets are adjusted per data set further down.
	datafields = {'ID':[5,int],'SERIES':[23,str],'SHOT':[48,str],'YEAR':[52,int],
	  'MON':[57,str], 'DAY':[62,int], 'TIME':[76,str], 'SITE':[85,str],
	  'LAT':[94,float], 'LONG':[106,float], 'LOCCODE':[111,str], 'HOLE' :[124,str],
	  'HOB':[130,float], 'GZALT':[135,int], 'GZCODE':[139,str], 'TYPE':[146,str],
	  'PUR':[152,str], 'YD-MN':[157,float], 'YD-MX':[163,float], 'YD-EST':[168,str],
	  'NT':[174,str], 'YIELD':[180,str], 'MAG': [187,float], 'CRAT':[192,str],
	  'VENT': [198,str], 'DEVICE': [204,str], 'WARHEAD':[214,str],
	  'SPONSOR':[220,str], 'REPNOTES':[223,str], 'SOURCES':[-1,str]}

	# Debugging aid: with breakat set to True the run stops once lineno exceeds
	# linebreak.
	breakat = False
	linebreak = 350

	country = 'USA'

	# Per-data-set column corrections, selected by substrings of the file name.
	# NOTE(review): 'United' matches every file name containing it (one with
	# "United Kingdom" as well as "United States") - confirm that is intended.
	if "United" in filename:
	    datafields['SITE'][0] -= 2
	    datafields['SHOT'][0] -= 2
	    datafields['HOLE'][0] -= 4
	    datafields['PUR'][0] -= 1
	    datafields['NT'][0] -= 2
	    datafields['VENT'][0] += 5

	if 'USSR' in filename or 'India' in filename:
	    datafields['YD-MN'][0] += 1
	    country = 'USSR' # For india and others, country set later on
	    datafields['GZCODE'][0] += 1
	    datafields['VENT'][0] -= 1
	    datafields['DEVICE'][0] -= 1
	    if 'part 2' in filename or 'part 3' in filename:
	        datafields['VENT'][0] += 3

	if 'France' in filename:
	    datafields['LONG'][0] += 1
	    datafields['LAT'][0] += 1
	    datafields['YIELD'][0] += 1
	    datafields['PUR'][0] += 1
	    datafields['YD-MX'][0] -= 1
	    datafields['GZALT'][0] += 2
	    datafields['HOB'][0] += 2
	    datafields['TYPE'][0] += 1
	    datafields['GZCODE'][0] += 1
	    country = 'France'

	if 'China' in filename:
	    datafields['YD-MX'][0] += 1
	    datafields['YD-MN'][0] += 1
	    datafields['VENT'][0] -= 1
	    datafields['DEVICE'][0] -= 1
	    country = 'China'

	if 'Kingdom' in filename:
	    country = 'UK'

	if 'India' in filename:
	    datafields['VENT'][0] -= 1
	    # Country being set from data file

	readdata = False       # True while inside the <pre>...</pre> data section
	headerprinted = False  # the column header is written before the first record
	lineno = 1
	# Context managers guarantee all three files are closed, including when the
	# script leaves through sys.exit() or an unexpected exception.
	with open(filename, 'r', encoding='cp1252') as inputfile, \
	        open(outfile, 'w') as outputfile, \
	        open(errfilename, 'w') as errorfile:
	    for line in inputfile:
	        line = line.strip('\n')
	        # Decode the HTML entities first so the fixed-width columns line up.
	        line = line.replace('&gt;', '>')
	        line = line.replace('&lt;', '<')

	        if not readdata:
	            # Still in the page preamble: echo it and wait for the <pre> tag.
	            if '<pre>' in line:
	                readdata = True
	            print(line)
	            continue

	        lineno += 1
	        print(f'--->{lineno}')
	        if line.startswith('***'):
	            # A '***' line carries the country for the records that follow.
	            country = line.replace('*', ' ').strip(' ')
	            continue
	        if '</pre>' in line:
	            readdata = False
	            continue
	        # Only lines whose first five characters form an integer are records.
	        try:
	            int(line[0:5])
	        except ValueError:
	            print(line)
	            continue

	        rec = {'line': line.replace(',', ';')}
	        start = 0
	        for field, (end, convert) in datafields.items():
	            if end > 0:
	                rec[field] = line[start:end].strip()
	            else:
	                rec[field] = line[start:].strip()
	            if field == 'MAG' and rec[field] == '4.80C':
	                rec[field] = '4.80' # Pretty sure this is an error in the Pakistan data set, nothing like this anywhere else
	            start = end
	            if rec[field] == '':
	                rec[field] = None
	                continue
	            # Convert to correct type; a leading '<' or '>' on a numeric
	            # field is split off into a separate '<field>sign' entry.
	            try:
	                if convert is not str and rec[field][0] in ('<', '>'):
	                    rec[field + 'sign'] = rec[field][:1]
	                    rec[field] = convert(rec[field][1:])
	                else:
	                    rec[field] = convert(rec[field])
	            except ValueError as e:
	                print(e)
	                print(line)
	                print(field)
	                sys.exit(3)

	        if rec['ID'] is None:
	            continue
	        try:
	            mndnum = months.index(rec['MON'])
	        except ValueError:
	            # Unrecognised (or missing) month: divert the line to the .err file.
	            errorfile.write(line)
	            errorfile.write('\n')
	            continue
	        mndnum = str(mndnum).zfill(2)
	        rec['COUNTRY'] = country
	        # Double-quoted f-string so the nested subscripts also parse on
	        # Python versions older than 3.12.
	        rec['TIMESTAMP'] = f"{rec['YEAR']}{mndnum}{rec['DAY']}T{rec['TIME']}"
	        print(rec)
	        if breakat and lineno > linebreak:
	            sys.exit()
	        data = [str(value) for value in rec.values()]
	        print(lineno)
	        # NOTE(review): the header is taken from the first record only; a
	        # later record that gains a '<field>sign' entry writes more columns
	        # than the header has.
	        if not headerprinted:
	            outputfile.write('\t'.join(rec.keys()))
	            outputfile.write('\n')
	            headerprinted = True
	        outputfile.write('\t'.join(data))
	        outputfile.write('\n')