-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathcsv_condense.py
More file actions
150 lines (133 loc) · 6.7 KB
/
csv_condense.py
File metadata and controls
150 lines (133 loc) · 6.7 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
import csv
import os
import operator
import itertools
import io
from collections import defaultdict
strinputFile = "D:\\logs\\fileforanalysis.csv" #Input file or folder to process
strOutPath = "D:\\analysis\\output.csv" #output file
boolTruncateAtChar = True #truncate key at first space
strTruncateChar = "\t"
boolIncludeCount = True
boolQuoteOutput = True #add quotes around each field
inputEncoding = "utf-8"
outputEncoding = "utf-8"
intNumericAddColumn = 9; #Column containing numeric value to add up. Set to -1 to disable.
boolHasHeader = True #if no header in CSV then set to false and provide integer values for dictHeader
boolIgnoreCountErrror = True #skip counting entries where intNumericAddColumn is not numeric
maxInt = 100000000 #csv.field_size_limit
dictHeader = dict() #add the column header for each column that will be tracked
dictNumeric = dict()
# the following are example column headers. Remove the # to use or add your own
# if your CSV has no header row then set the dictionary header to equal the column number. Example: dictHeader["Dst VM"] = 1;
#dictHeader["Dst VM"] = "";
#dictHeader["Src VM"] = "";
#dictHeader["Dst IP"] = "";
#dictHeader["IP Address"] = "";
#dictHeader["Remote Port"] = "";
#dictHeader["Domain"] = "";
#dictHeader["Success"] = "";
#Kansa scheduled task
#dictHeader["TaskName"] = ""
#dictHeader["Task To Run"] = ""
#Cb Response Network Activity
#dictHeader["User Name"] = "";
#dictHeader["Sensor ID"] = "";
#dictHeader["IP Address"] = "";
#dictHeader["Domain"] = "";
#Cb Response Hash Dump
#dictHeader["Path"] = "";
#dictHeader["Publisher"] = "";
#dictHeader["Company"] = "";
#dictHeader["Product"] = "";
#dictHeader["Internal Name"] = "";
#dictHeader["Original File Name"] = "";
#Cb Response module activity
#dictHeader["MD5"] = "";
#dictHeader["File Path"] = "";
#dictHeader["Computer"] = "";
#dictHeader["Sensor ID"] = "";
def writeCSV(fHandle, rowOut):
if boolQuoteOutput == True:
fHandle.write("\"" + rowOut.replace("|", "\",\"") + "\"\n")
else:
fHandle.write(rowOut.replace("|", ",") + "\n")
dictOutput = dict()
listFilePaths = []
if os.path.isdir(strinputFile):
for (dirpath, dirnames, filenames) in os.walk(strinputFile):
for file in filenames:
scanPath = os.path.join(dirpath, file)
print(scanPath + "\n")
listFilePaths.append(scanPath)
elif os.path.isfile(strinputFile):
listFilePaths.append(strinputFile)
else:
print("Bad file path?")
for strinputFile in listFilePaths:
boolFirstRow = boolHasHeader;
csv.field_size_limit(maxInt)
intRowCount = 0
with open(strinputFile, "rt", encoding=inputEncoding) as csvfile: #, encoding="utf-16"
reader = csv.reader(csvfile, delimiter=',', quotechar='\"')
keyitem = "";
for row in reader:
if boolFirstRow == True: #parse header row
boolFirstRow= False
if boolHasHeader == True:
for headItem in row:
if headItem in dictHeader:
dictHeader[headItem] = intRowCount;
intRowCount += 1;
for headItem in dictHeader:
if not isnumeric(dictHeader[headItem]):
print("Missing header value \"" + headItem + "\"")
else: #not blank row
keyitem = ""
for columnloc in sorted(dictHeader, key=dictHeader.get): #sort off value to keep column alignment consistent with source
if dictHeader[columnloc] != '': #if found in header
if keyitem == "":
if len(row) >= dictHeader[columnloc]:
keyitem = row[dictHeader[columnloc]];
if boolTruncateAtChar == True:
if keyitem.find(strTruncateChar) > -1:
keyitem = keyitem[:keyitem.find(strTruncateChar)]
else:
f2 = open(strinputFile + ".error", 'a+', encoding="utf-16")
f2.write("".join(row) + "\n") #write header row
elif len(row) <= dictHeader[columnloc]: #Error! can't grab value
print("Row not as long as header: " + str(len(row)) + "<=" + str(dictHeader[columnloc]) + "\n" + " ".join(row))
print(row)
else: #aggregate key names
keyitem = keyitem + "|" + row[dictHeader[columnloc]];
if intNumericAddColumn > -1 and intNumericAddColumn < len(row): #intNumericAddColumn has a value and is less than the row count
if row[intNumericAddColumn].isnumeric(): #Column we are counting has a numeric value
intNumeric = int(row[intNumericAddColumn])
if keyitem in dictNumeric: #aggregate value
dictNumeric[keyitem] = int(dictNumeric[keyitem]) + int(intNumeric) #math
else: #new addition to dict
dictNumeric[keyitem] = int(intNumeric)
else: #non-numeric value for intNumericAddColumn
print("Error! intNumericAddColumn is Not numeric: " + row[intNumericAddColumn] + "\n" + keyitem + "\n" + str(intNumericAddColumn) + "|" + str(len(row)) + "\n" + " ".join(row))
elif intNumericAddColumn > len(row) -1 and boolIgnoreCountErrror == True:
print("ingoring row: " + " ".join(row) + "\nfrom file " + strinputFile + "\non line " + str(intRowCount))
elif intNumericAddColumn > len(row) -1:
print("Error: intNumericAddColumn is greather than the CSV column count. \n" + str(intNumericAddColumn) + ">=" + str(len(row)) + "Disabling numeric add function\n" + strinputFile + "\n" + " ".join(row))
intNumericAddColumn = -1
elif intNumericAddColumn != -1 and keyitem not in dictNumeric: #should only hit here if something is wrong
print("not sure how we got here. Check your input file(s) and config? " + keyitem + "\n" + str(intNumericAddColumn) + "|" + str(len(row)) + "\n" + " ".join(row))
if keyitem in dictOutput:
dictOutput[keyitem] += 1;
else:
dictOutput[keyitem] = 1;
intRowCount +=1
with io.open(strOutPath, "w", encoding=outputEncoding) as f: #encoding="utf-16"
for lineEntry in dictOutput:
if boolIncludeCount == True: #Count of entries combined
outputline = lineEntry + "|" + str(dictOutput[lineEntry])
else:
outputline = lineEntry
if intNumericAddColumn > -1: #Integer aggregation
strCount = "|" + str(dictNumeric[lineEntry])
outputline = outputline + strCount
writeCSV(f,outputline)