'''
Created on Sep 1, 2011
@author: Dilum Bandara
@contact: dilumb@engr.colostate.edu
@version: 0.2
@copyright: Copyright(c) 2011, Dilum Bandara
@license: GPL
'''

'''
Extract queries from an already formatted SWORD log file, then build the
probabilistic finite state machine. Attribute ranges are used as well.
If the log is not already formatted, use formatLog.py to format it first.
'''

import BaseConfig

fIn = 'allQueries.txt'
#fIn = 'log.dilum_3_step_2.txt'
fOut = 'States.txt'

# A line is skipped when its first whitespace-separated token is one of these.
_IGNORED_FIRST_TOKENS = ('<request>', '</request>', '<group>', '</group>',
                         '</location>', '</os_name>')
# A line is skipped when it contains any of these tags anywhere.
_IGNORED_TAGS = ('<name>', '<cost>', '<success>', '<answered>', '<location>',
                 '<os_name>')

# Attributes whose value is four comma-separated numbers.
# Each entry is (tag, state-name prefix, BaseConfig limit suffix, ndigits):
#   * the clamping limits are read lazily as BaseConfig.MIN_<suffix> and
#     BaseConfig.MAX_<suffix>, so only limits of tags that actually occur
#     in the log need to exist;
#   * ndigits is passed to round() when the state name is built.
# Order matters: the first tag found in a line wins.
_RANGE_ATTRIBUTES = (
    ('<fiveminload>', '5mLd_', '5_MIN_LOAD', 1),
    ('<oneminload>', '1mLd_', '1_MIN_LOAD', 1),
    ('<load_one>', '1mLd_', '1_MIN_LOAD', 1),
    ('<memsize>', 'MSize_', 'MEM_SIZE', 1),
    ('<memact>', 'MSize_', 'MEM_SIZE', 1),
    ('<freecpu>', 'CFree_', 'CPU_FREE', 0),
    ('<numofcores>', 'NCore_', 'NUM_CORES', 0),
    ('<bootstate>', 'Boot_', 'BOOT', 0),
    ('<kernver>', 'KernVer_', 'KERN_VER', 0),
    ('<freemem>', 'MFree_', 'MEM_FREE', 0),
    ('<free_mem>', 'MFree_', 'MEM_FREE', 0),
    ('<mem_free>', 'MFree_', 'MEM_FREE', 0),
    ('<disksize>', 'DSize_', 'DISK_SIZE', -1),
    ('<uptime>', 'upT_', 'UPTIME', -1),
    ('<gbfree>', 'DFree_', 'DISK_FREE', -1),
    ('<disk_free>', 'DFree_', 'DISK_FREE', -1),
    ('<cpuspeed>', 'CSp_', 'CPU_SPEED', 1),
    ('<drift>', 'Drift_', 'DRIFT', 1),
    ('<latency>', 'TAve_', 'TIMER_AVE', 1),
    ('<resptime>', 'RespT_', 'RESP_TIME', 1),
    ('<txrate>', 'Tx_', 'TX_RATE', -1),
    ('<rxrate>', 'Rx_', 'RX_RATE', -1),
    ('<bwlimit>', 'BWLim_', 'BW_LIMIT', -1),
)

# Region name (first value of the line) -> state name.
_REGION_STATES = {
    'northam': 'Loc_1',
    'northamerica': 'Loc_1',
    'southam': 'Loc_2',
    'europe': 'Loc_3',
    'asia': 'Loc_4',
}


def _tag_text(line):
    """Return the text between the first '>' and the first '</' of line,
    with all spaces removed.

    Raises ValueError if either marker is missing.
    """
    start = line.index('>')
    end = line.index('</')
    return line[start + 1:end].replace(' ', '')


def _range_state(line, numLine, prefix, limit, ndigits):
    """Build the state name for a four-valued attribute line.

    line    -- lower-cased log line containing the attribute
    numLine -- line number, used only in the 'ERROR' diagnostics
    prefix  -- state-name prefix, e.g. 'MSize_'
    limit   -- suffix of the BaseConfig.MIN_* / BaseConfig.MAX_* limits
    ndigits -- rounding precision handed to round()

    The state name is prefix + rounded first value + '_' + rounded fourth
    value, after clamping the first value up to the MIN limit and the
    fourth value down to the MAX limit.
    """
    values = _tag_text(line).split(',')
    floor = getattr(BaseConfig, 'MIN_' + limit)
    ceiling = getattr(BaseConfig, 'MAX_' + limit)

    # A literal 'max' in the third or fourth field stands for the
    # attribute's configured maximum. This is applied to every attribute so
    # no tag ends up calling float('max').
    if values[2] == 'max' or values[3] == 'max':
        values[2] = ceiling
        values[3] = ceiling

    nums = [float(v) for v in values[:4]]
    # Ordering sanity checks; violations are reported but still processed.
    if nums[1] < nums[0]:
        print('ERROR %s %s' % (numLine, line))
    if nums[3] < nums[2]:
        print('ERROR %s %s' % (numLine, line))
    if nums[3] < nums[0]:
        print('ERROR %s %s' % (numLine, line))

    low = nums[0]
    high = nums[3]
    if low < float(floor):
        low = float(floor)
    if high > float(ceiling):
        high = float(ceiling)
    return prefix + str(round(low, ndigits)) + '_' + str(round(high, ndigits))


def _attribute_state(line, numLine):
    """Translate one lower-cased attribute line into a state name.

    Returns the state name, or None when the line is not recognised (in
    which case a diagnostic is printed).
    """
    if '<num_machines>' in line:
        return 'NumRes_' + _tag_text(line)

    for tag, prefix, limit, ndigits in _RANGE_ATTRIBUTES:
        if tag in line:
            return _range_state(line, numLine, prefix, limit, ndigits)

    if any(region in line for region in _REGION_STATES):
        region = _tag_text(line).replace('\'', '').split(',')[0]
        if region in _REGION_STATES:
            return _REGION_STATES[region]
        print(line)
        return None

    if '<value>linux' in line:
        values = _tag_text(line).split(',')
        namex = float(values[1])
        # Limits are coerced with float() like every other clamp in this file.
        if namex < float(BaseConfig.MIN_FC_NAMEX):
            namex = float(BaseConfig.MIN_FC_NAMEX)
        if namex > float(BaseConfig.MAX_FC_NAMEX):
            namex = float(BaseConfig.MAX_FC_NAMEX)
        return 'FCNamex_' + str(round(namex, 0))

    if '<noattribute>' in line:
        return 'NoAtt'

    print('UNKNOWN LINE: %s %s' % (numLine, line))  # Unknown line format
    return None


def _record_query(states, attbs):
    """Count the transitions START -> attbs[0] -> ... -> attbs[-1] -> FINISH.

    states -- {state: {next state: number of queries}}, updated in place
    attbs  -- sorted list of state names of one query; an empty list
              records nothing
    """
    if not attbs:
        return
    path = ['START'] + attbs + ['FINISH']
    for src, dst in zip(path, path[1:]):
        successors = states.setdefault(src, {})
        successors[dst] = successors.get(dst, 0) + 1


states = {'START': {}}  # {start state: {next state: num of queries}}
attbs = []
numLine = 0
new_query = False
with open(fIn, 'r') as fdIn:
    for line in fdIn:
        numLine += 1
        line = line.lower()
        lineFmt = line.split()
        if not lineFmt:
            continue  # Blank line: nothing to parse
        first = lineFmt[0]
        if first == '<query>':
            if new_query:
                print('New query inside another. See line  %s' % numLine)
            attbs = []
            new_query = True
        elif first == '</query>':
            if not new_query:
                print('Old query inside another. See line  %s' % numLine)
            new_query = False
            attbs.sort()  # Sort attribute list
            _record_query(states, attbs)
        elif first in _IGNORED_FIRST_TOKENS \
                or any(tag in line for tag in _IGNORED_TAGS):
            continue
        else:
            state = _attribute_state(line, numLine)
            if state is not None:
                attbs.append(state)

# Dump states, one "state<TAB>next state<TAB>count" line per transition
with open(fOut, 'w') as fdOut:
    for src in states:
        for dst in states[src]:
            fdOut.write(src + '\t' + dst + '\t' + str(states[src][dst]) + '\n')
