'''Given a search term, generate the histogram of query rate for each hour
'''

__author__ = 'Dilum Bandara'
__version__ = "0.1"
__date__ = "07/23/2010"
__license__ = 'Python'
__copyright__ = "Copyright (c) 2010 Dilum Bandara"

from operator import itemgetter
#import matplotlib.pyplot as plt
import numpy as np

inFile = 'dataset1.txt'
outFile = 'histogram_time_keyword.txt'
#Search term that you want to extract the histogram for
keyword = 'karate kid'
#No of buckets for entire dataset. We have 445 hours
noBuckets = 445
#Number of seconds per bucket
bucketSize = 3600

#NOTE(review): keywords and keywordGroups are filled below but never read in
#this script; they are kept so the module-level names remain available.
keywords = []
keywordGroups = {}
#Timestamps of every query whose search term contains the keyword
times = []
#Number of well-formed records read
count = 0
#Number of lines ignored because they had no tab-separated timestamp field
skipped = 0
#Set mintime manually or set to very high value
minTime = 1276019940
maxTime = 0

#Each record is read as "<search term>\t<timestamp>"; any further
#tab-separated fields are ignored.
#The with-statement guarantees the file is closed even if a line fails to parse.
with open(inFile, 'r') as fIn:
    for line in fIn:
        tmp = line.split('\t')
        if len(tmp) < 2:
            #Blank or malformed line (e.g. trailing newline at end of file)
            skipped = skipped + 1
            continue
        count = count + 1
        #Indicate progress
        #(single-argument parenthesised print works on both Python 2 and 3)
        if count % 10000 == 0:
            print(count)
        term = tmp[0]
        #Parse the timestamp once; int() tolerates the trailing newline
        timestamp = int(tmp[1])
        keywords.append(term)
        #TODO this is a substring test, so it matches any search term that
        #contains the given string. Update the code to correct it
        if keyword in term:
            times.append(timestamp)
        if timestamp < minTime:
            minTime = timestamp
        if timestamp > maxTime:
            maxTime = timestamp
        keywordGroups[term] = keywordGroups.get(term, 0) + 1

print('Total keywords ' + str(count))
print('MIN time ' + str(minTime))
print('MAX time ' + str(maxTime))
if skipped:
    print('Skipped malformed lines ' + str(skipped))

#Count the matching queries per bucket, measured from the earliest timestamp
buckets = [0] * noBuckets
#Matching timestamps that fall outside the noBuckets * bucketSize window
outOfRange = 0
for t in times:
    #Floor division keeps the index an integer on both Python 2 and 3
    index = (t - minTime) // bucketSize
    if 0 <= index < noBuckets:
        buckets[index] = buckets[index] + 1
    else:
        #Previously this raised IndexError; report it instead so that
        #noBuckets can be increased if the dataset spans more time
        outOfRange = outOfRange + 1

if outOfRange:
    print('WARNING: ' + str(outOfRange) +
          ' matching queries fall outside the ' + str(noBuckets) +
          ' buckets and were not counted')

#One output line per bucket: "<bucket index>\t<number of matching queries>"
with open(outFile, 'w') as fOut:
    for j, bucketCount in enumerate(buckets):
        fOut.write(str(j) + '\t' + str(bucketCount) + '\n')
#n, bins, patches = plt.hist(times, 10)
#plt.show()

