'''Extract keywords from the data file & put them into a histogram based on common search
   terms and the time that searches were issued
'''

__author__ = 'Dilum Bandara'
__version__ = "0.1"
__date__ = "07/22/2010"
__license__ = 'Python'
__copyright__ = "Copyright (c) 2010 Dilum Bandara"

from operator import itemgetter
#import matplotlib.pyplot as plt
import numpy as np

#Input data file: each line is read as "<keyword>\t<time>", where <time> must
#parse as an integer
inFile = 'dataset1.txt'
#Output files for the keyword histogram and the (optional) time histogram
outFile1 = 'histogram_keywords.txt'
outFile2 = 'histogram_time.txt'

keywords = []       #Keyword of every record, in file order
keywordGroups = {}  #Maps keyword -> number of records with that keyword
times = []          #Time of every record, in file order
count = 0           #Number of records read so far

#'with' makes sure the input file is closed even if a line fails to parse
with open(inFile, 'r') as fIn:
    for line in fIn:
        #Skip blank lines (e.g. a trailing newline at the end of the file);
        #they hold no record and would otherwise fail on tmp[1] below
        if not line.strip():
            continue
        count = count + 1
        #Print to see progress
        if count % 1000 == 0:
            #Single-argument print(...) behaves the same on Python 2 and 3
            print(count)
        #Extract keyword & time (first two tab-separated fields)
        tmp = line.split('\t')
        keyword = tmp[0]
        keywords.append(keyword)
        #int() tolerates the trailing newline left on the last field
        times.append(int(tmp[1]))
        #Increment counter for this keyword, starting from 0 if it is new
        keywordGroups[keyword] = keywordGroups.get(keyword, 0) + 1
print("Total keywords " + str(count))

#Dump keyword histogram: one "<keyword>\t<count>" line per distinct keyword,
#ordered from the most frequent keyword to the least frequent
y = []  #Bar heights for the optional plot below; filled only if the append is uncommented
sortedKeywords = sorted(keywordGroups.items(), key=itemgetter(1), reverse=True)
#'with' closes the output file even if a write fails part-way through
with open(outFile1, 'w') as fOut1:
    for keyword, occurrences in sortedKeywords:
        fOut1.write(str(keyword) + '\t' + str(occurrences) + '\n')
#        y.append(occurrences)

#x = np.arange(0, 77, 1)
#plt.bar(x, y)

#Uncomment the following to plot or dump the rate of arrival of search queries
#n, bins, patches = plt.hist(times, 448)
#plt.show()

#fOut2 = open(outFile2, 'w')
#for j in bins:
#    fOut2.write(str(j) + '\n')
#fOut2.close()