'''
   Sample keywords from the histogram of search terms to generate the log-log plot to see Zipf's distribution
'''
__author__ = 'Dilum Bandara'
__version__ = "0.1"
__date__ = "07/29/2010"
__license__ = 'Python'
__copyright__ = "Copyright (c) 2010 Dilum Bandara"


inFile = 'histogram_keywords_corrected.txt'
outFile = 'keyword_samples.txt'

# Sampling schedule as (highest 0-based line index covered, step) pairs, in
# ascending order of bound. A line whose index falls in a tier is kept only
# when the index is a multiple of that tier's step. Thinning the tail this way
# keeps the points roughly evenly spaced on a log-log plot.
# The first bound is 100 (not 1000) so that the "every other line" tier that
# follows it is actually reachable.
SAMPLING_TIERS = (
    (100, 1),           # first few lines: keep every line as it is
    (1000, 2),          # keep every other line
    (10000, 10),        # keep every 10th line
    (100000, 100),      # keep every 100th line
    (1000000, 1000),    # keep every 1000th line
    (10000000, 10000),  # keep every 10000th line
)


def sampling_step(index):
    '''
       Return the sampling step for the 0-based line index, or None when the
       index lies beyond the last tier (such lines are never sampled).
    '''
    for upper, step in SAMPLING_TIERS:
        if index <= upper:
            return step
    return None


def sample_lines(lines, write):
    '''
       Pass a thinned-out subset of lines to write and return the line count.

       lines -- iterable of text lines; each is emitted unchanged, so it
                keeps whatever line terminator it already has.
       write -- callable taking one string, e.g. the write method of a file.

       Every emitted line is prefixed with its 1-based position in the input
       followed by a tab. The return value is the total number of lines read,
       whether or not they were emitted.
    '''
    count = 0
    for line in lines:
        step = sampling_step(count)
        # The modulus is taken on the 0-based index, so the emitted positions
        # are of the form k * step + 1.
        if step is not None and count % step == 0:
            write(str(count + 1) + '\t' + line)
        count += 1
    return count


def main():
    '''
       Sample inFile into outFile and print the number of lines read.
    '''
    # The with statement closes both files even if reading or writing fails.
    with open(inFile, 'r') as fIn, open(outFile, 'w') as fOut:
        count = sample_lines(fIn, fOut.write)
    print(count)


if __name__ == '__main__':
    main()
