devmacrile · February 5, 2015 16:29
diff --git a/map1.py b/map1.py
 #!/usr/bin/python
 #Mapper 1 of the TF-IDF pipeline: for every token of every incident text, emit
 #'word\tincident id\t1'. Input lines have the form 'incident id\ttext'.
 import sys
 import re
 import nltk
 from nltk.corpus import stopwords

 #a set gives constant-time membership tests; the plain list was scanned per token
 stop_words = set(stopwords.words('english'))
 #compiled once instead of once per token; raw string so \W is not read as a
 #(deprecated) string escape. Matches runs of non-word characters and underscores.
 pattern = re.compile(r'[\W_]+')

 #input comes from standard input
 for line in sys.stdin:
 	#separate incident id from text; a line without a tab has no text to tokenize
 	incident_id, tab, incident = line.partition('\t')
 	if not tab:
 		continue
 	#split incident into words
 	for word in incident.split():
 		word = pattern.sub('', word.lower())
 		#remove empty tokens, stop words, words that start with #s
 		if not word or word in stop_words or word[0].isdigit():
 			continue
 		#write results to stdout
 		print('%s\t%s\t%s' % (word, incident_id, 1))
diff --git a/map2.py b/map2.py
 #!/usr/bin/python
 #Mapper 2 of the TF-IDF pipeline: identity mapper that re-emits every
 #'term\tincident\ttf' line (presumably the output of red1.py) unchanged.
 import sys
 import re


 def parse_record(line):
 	#split one input line into [term, incident, tf]; return None when a field is
 	#missing. maxsplit is 2 (not 3) so the result can never exceed three fields,
 	#matching how red1.py and red2.py parse the same kind of line.
 	fields = line.strip().split('\t', 2)
 	if len(fields) != 3:
 		return None
 	return fields


 def main():
 	#input comes from standard input
 	for line in sys.stdin:
 		record = parse_record(line)
 		if record is None:
 			#blank or truncated line, discard it instead of crashing the task
 			continue
 		#print out the record as term, incident, tf
 		print('%s\t%s\t%s' % tuple(record))


 if __name__ == '__main__':
 	main()
diff --git a/map3.py b/map3.py
 #!/usr/bin/python
 #Mapper 3 of the TF-IDF pipeline: turn every 'term\tincident\ttf\tdf' line into
 #'term\tincident\ttfidf', where tfidf = tf * log10(N / df).

 import sys
 import math

 #default number of documents; pass a different value as the first command-line
 #argument to avoid editing this file
 N = 668245.0


 def tfidf_record(line, n):
 	#return the 'term\tincident\ttfidf' output for one input line, or None when
 	#the line cannot be scored (missing field, non-integer tf/df, or df <= 0)
 	#the strip is necessary to remove eol characters
 	fields = line.strip().split('\t', 3)
 	if len(fields) != 4:
 		return None
 	word, incident, tf, df = fields
 	try:
 		tf = int(tf)
 		df = int(df)
 	except ValueError:
 		#tf or df not a number, discard line (same policy as red1.py)
 		return None
 	if df <= 0:
 		#N / df would divide by zero or feed log10 a negative number
 		return None
 	tfidf = tf * math.log10(n / float(df))
 	return '%s\t%s\t%s' % (word, incident, tfidf)


 def main():
 	#an optional first command-line argument overrides the default corpus size
 	n = float(sys.argv[1]) if len(sys.argv) > 1 else N
 	#input comes from standard input
 	for line in sys.stdin:
 		record = tfidf_record(line, n)
 		if record is not None:
 			print(record)


 if __name__ == '__main__':
 	main()
diff --git a/red1.py b/red1.py
 #!/usr/bin/python
 #Reducer 1 of the TF-IDF pipeline: sum the counts of consecutive
 #'word\tincident\tcount' lines that share the same word and incident, and emit
 #one 'word\tincident\ttotal' line per such run.

 import sys


 def sum_counts(lines):
 	#yield one 'word\tincident\ttotal' string per run of equal (word, incident)
 	#keys found in the iterable of input lines
 	current_key = None
 	current_count = 0
 	for line in lines:
 		#remove leading and trailing whitespace, then parse input from mapper.py
 		fields = line.strip().split('\t', 2)
 		if len(fields) != 3:
 			#blank or truncated line, discard it
 			continue
 		word, incident, count = fields
 		try:
 			count = int(count)
 		except ValueError:
 			#count not a number, discard line
 			continue
 		key = (word, incident)
 		if key == current_key:
 			current_count += count
 			continue
 		if current_key is not None:
 			yield '%s\t%s\t%s' % (current_key[0], current_key[1], current_count)
 		current_key = key
 		current_count = count
 	#have to output last word. Testing for None (rather than comparing with the
 	#last parsed word) keeps the final group when the last line was discarded and
 	#emits nothing at all for empty input.
 	if current_key is not None:
 		yield '%s\t%s\t%s' % (current_key[0], current_key[1], current_count)


 def main():
 	#input comes from stdin, results go to stdout
 	for record in sum_counts(sys.stdin):
 		print(record)


 if __name__ == '__main__':
 	main()


diff --git a/red2.py b/red2.py
 #!/usr/bin/python
 #Reducer 2 of the TF-IDF pipeline: append the document frequency (df) of each
 #term to every one of its 'term\tincident\ttf' lines. df is the number of
 #consecutive input lines that share the term, so input must arrive grouped by term.

 import sys


 def with_df(group):
 	#return the records of one term with '\tdf' appended, df being the group size
 	df = len(group)
 	#the first record of a term is emitted after its other records; this keeps
 	#the line order the reducer produced before it was restructured
 	return ['%s\t%s' % (record, df) for record in group[1:] + group[:1]]


 def attach_df(lines):
 	#yield 'term\tincident\ttf\tdf' strings for an iterable of input lines
 	current_word = None
 	group = []
 	for line in lines:
 		#parse input from map2.py
 		fields = line.strip().split('\t', 2)
 		if len(fields) != 3:
 			#blank or truncated line, discard it
 			continue
 		if fields[0] != current_word:
 			#a new term starts, so the previous term is complete and its df known
 			for record in with_df(group):
 				yield record
 			group = []
 			current_word = fields[0]
 		group.append('\t'.join(fields))
 	#have to output the last term; for empty input the group is empty and
 	#nothing is emitted
 	for record in with_df(group):
 		yield record


 def main():
 	#input comes from stdin, results go to stdout
 	for record in attach_df(sys.stdin):
 		print(record)


 if __name__ == '__main__':
 	main()










diff --git a/red3.py b/red3.py
 #!/usr/bin/python
 #Reducer 3 of the TF-IDF pipeline: identity reducer that echoes every input
 #line with its leading and trailing whitespace removed.

 import sys

 for line in sys.stdin:
 	#call form of print, as in the other scripts; unlike the 'print line'
 	#statement it is valid on both Python 2 and Python 3
 	print(line.strip())
	#!/usr/bin/python
	#Mapper 1 of the TF-IDF pipeline: for every token of every incident text, emit
	#'word\tincident id\t1'. Input lines have the form 'incident id\ttext'.
	import sys
	import re
	import nltk
	from nltk.corpus import stopwords

	#a set gives constant-time membership tests; the plain list was scanned per token
	stop_words = set(stopwords.words('english'))
	#compiled once instead of once per token; raw string so \W is not read as a
	#(deprecated) string escape. Matches runs of non-word characters and underscores.
	pattern = re.compile(r'[\W_]+')

	#input comes from standard input
	for line in sys.stdin:
		#separate incident id from text; a line without a tab has no text to tokenize
		incident_id, tab, incident = line.partition('\t')
		if not tab:
			continue
		#split incident into words
		for word in incident.split():
			word = pattern.sub('', word.lower())
			#remove empty tokens, stop words, words that start with #s
			if not word or word in stop_words or word[0].isdigit():
				continue
			#write results to stdout
			print('%s\t%s\t%s' % (word, incident_id, 1))
	#!/usr/bin/python
	#Mapper 3 of the TF-IDF pipeline: turn every 'term\tincident\ttf\tdf' line into
	#'term\tincident\ttfidf', where tfidf = tf * log10(N / df).

	import sys
	import math

	#default number of documents; pass a different value as the first command-line
	#argument to avoid editing this file
	N = 668245.0


	def tfidf_record(line, n):
		#return the 'term\tincident\ttfidf' output for one input line, or None when
		#the line cannot be scored (missing field, non-integer tf/df, or df <= 0)
		#the strip is necessary to remove eol characters
		fields = line.strip().split('\t', 3)
		if len(fields) != 4:
			return None
		word, incident, tf, df = fields
		try:
			tf = int(tf)
			df = int(df)
		except ValueError:
			#tf or df not a number, discard line
			return None
		if df <= 0:
			#N / df would divide by zero or feed log10 a negative number
			return None
		tfidf = tf * math.log10(n / float(df))
		return '%s\t%s\t%s' % (word, incident, tfidf)


	def main():
		#an optional first command-line argument overrides the default corpus size
		n = float(sys.argv[1]) if len(sys.argv) > 1 else N
		#input comes from standard input
		for line in sys.stdin:
			record = tfidf_record(line, n)
			if record is not None:
				print(record)


	if __name__ == '__main__':
		main()
	#!/usr/bin/python
	#Reducer 1 of the TF-IDF pipeline: sum the counts of consecutive
	#'word\tincident\tcount' lines that share the same word and incident, and emit
	#one 'word\tincident\ttotal' line per such run.

	import sys


	def sum_counts(lines):
		#yield one 'word\tincident\ttotal' string per run of equal (word, incident)
		#keys found in the iterable of input lines
		current_key = None
		current_count = 0
		for line in lines:
			#remove leading and trailing whitespace, then parse input from mapper.py
			fields = line.strip().split('\t', 2)
			if len(fields) != 3:
				#blank or truncated line, discard it
				continue
			word, incident, count = fields
			try:
				count = int(count)
			except ValueError:
				#count not a number, discard line
				continue
			key = (word, incident)
			if key == current_key:
				current_count += count
				continue
			if current_key is not None:
				yield '%s\t%s\t%s' % (current_key[0], current_key[1], current_count)
			current_key = key
			current_count = count
		#have to output last word. Testing for None (rather than comparing with the
		#last parsed word) keeps the final group when the last line was discarded and
		#emits nothing at all for empty input.
		if current_key is not None:
			yield '%s\t%s\t%s' % (current_key[0], current_key[1], current_count)


	def main():
		#input comes from stdin, results go to stdout
		for record in sum_counts(sys.stdin):
			print(record)


	if __name__ == '__main__':
		main()
	#!/usr/bin/python
	#Reducer 2 of the TF-IDF pipeline: append the document frequency (df) of each
	#term to every one of its 'term\tincident\ttf' lines. df is the number of
	#consecutive input lines that share the term, so input must arrive grouped by term.

	import sys


	def with_df(group):
		#return the records of one term with '\tdf' appended, df being the group size
		df = len(group)
		#the first record of a term is emitted after its other records; this keeps
		#the line order the reducer produced before it was restructured
		return ['%s\t%s' % (record, df) for record in group[1:] + group[:1]]


	def attach_df(lines):
		#yield 'term\tincident\ttf\tdf' strings for an iterable of input lines
		current_word = None
		group = []
		for line in lines:
			#parse input from map2.py
			fields = line.strip().split('\t', 2)
			if len(fields) != 3:
				#blank or truncated line, discard it
				continue
			if fields[0] != current_word:
				#a new term starts, so the previous term is complete and its df known
				for record in with_df(group):
					yield record
				group = []
				current_word = fields[0]
			group.append('\t'.join(fields))
		#have to output the last term; for empty input the group is empty and
		#nothing is emitted
		for record in with_df(group):
			yield record


	def main():
		#input comes from stdin, results go to stdout
		for record in attach_df(sys.stdin):
			print(record)


	if __name__ == '__main__':
		main()