anonymize_csv.py 4.74 KB
Newer Older
Mohammad Imran Syed's avatar
Mohammad Imran Syed committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""

@Author : Mohammad Imran SYED (mohammad-imran.syed@lip6.fr)

Script that extracts selected fields from .pcap files (using tshark) and saves
them as tab-separated .txt files in the CSVTraces/ folder.

It also anonymizes the traces (sender MAC addresses) and saves the anonymized
copies in the AnonymizedTraces/ folder.

"""

import time
import hashlib
import subprocess
import os
import pandas as pd
import numpy as np
import re

class PCAP():
    """Convert .pcap captures into tab-separated traces and anonymize them.

    For every capture handed to :meth:`parsing`, two files are written below
    ``directory``:

    * ``CSVTraces/<name>.txt`` -- the fields extracted by tshark, with
      readable column names.
    * ``AnonymizedTraces/<name>.txt`` -- the same table with every value of
      the ``Source_MAC_address`` column replaced by a truncated SHA-512
      hex digest.

    Attributes:
        directory: Folder that holds the .pcap files. It may be passed to the
            constructor or assigned afterwards.
    """

    # tshark field name -> output column name, in output column order.
    _FIELD_TO_COLUMN = {
        "frame.number": "Frame_number",
        "frame.time_epoch": "Frame_time_epoch",
        "frame.time_relative": "Frame_time_relative",
        "wlan.fixed.timestamp": "Fixed_timestamp",
        "wlan_radio.signal_dbm": "RSSI_dBm",
        "wlan_radio.channel": "Channel",
        "wlan.fc.type": "Frame_type",
        "wlan.fc.type_subtype": "Frame_subtype",
        "wlan.fc.retry": "Retransmission",
        "wlan.fcs": "Checksum",
        "wlan.sa": "Source_MAC_address",
        "wlan.seq": "Sequence_number",
        "wlan.frag": "Fragment_number",
    }

    # Display filter handed to tshark via -Y.
    _DISPLAY_FILTER = '!_ws.malformed and wlan_radio.channel==1'

    # Number of rows processed at a time while anonymizing.
    _CHUNK_SIZE = 1000

    # Number of hex characters kept from each SHA-512 digest.
    _DIGEST_LENGTH = 64

    def __init__(self, directory='Traces/'):
        """Remember the folder that contains the .pcap files."""
        self.directory = directory

    def parsing(self, file):
        """Extract the trace of one capture and write its anonymized copy.

        Args:
            file: Name of the capture file inside ``self.directory``. The
                output files reuse this name with the extension replaced by
                ``.txt``.
        """
        # splitext handles any extension length, not only the 5 characters
        # of ".pcap".
        stem = os.path.splitext(file)[0]
        csv_path = os.path.join(self._ensure_folder('CSVTraces/'), stem + '.txt')
        anon_path = os.path.join(
            self._ensure_folder('AnonymizedTraces/'), stem + '.txt')

        self._run_tshark(os.path.join(self.directory, file), csv_path)
        self._normalize_trace(csv_path)
        self._anonymize_trace(csv_path, anon_path)

    def _ensure_folder(self, name):
        """Create ``name`` below ``self.directory`` if needed; return its path."""
        folder = os.path.join(self.directory, name)
        os.makedirs(folder, exist_ok=True)
        # NOTE(review): world-writable permissions are kept from the original
        # script; tighten them if the traces must not be editable by others.
        os.chmod(folder, 0o777)
        return folder

    def _run_tshark(self, pcap_path, csv_path):
        """Run tshark on ``pcap_path`` and store its output in ``csv_path``."""
        # "separator=/t" is tshark's own spelling for a tab separator.
        command = ['tshark', '-r', pcap_path, '-Y', self._DISPLAY_FILTER,
                   '-T', 'fields', '-E', 'header=y', '-E', 'separator=/t']
        for field in self._FIELD_TO_COLUMN:
            command += ['-e', field]

        # The context manager closes the output file even if tshark cannot
        # be started.
        with open(csv_path, 'wb') as output:
            completed = subprocess.run(command, stdout=output)

        # Whatever tshark managed to write is still processed, so a non-zero
        # status is only reported instead of aborting the run.
        if completed.returncode != 0:
            print("Warning: tshark exited with status "
                  + str(completed.returncode) + " for " + pcap_path)

    def _normalize_trace(self, csv_path):
        """Rename the columns of ``csv_path`` and fix the integer columns."""
        # The former squeeze=True argument was dropped: it had no effect on a
        # multi-column table and pandas 2.x rejects it.
        trace = pd.read_csv(csv_path, delimiter='\t')
        trace = trace.rename(columns=self._FIELD_TO_COLUMN)

        # Missing values become 0 so that Sequence_number can be cast to int
        # and every row has a value to hash.
        trace['Source_MAC_address'] = trace['Source_MAC_address'].fillna(0)
        trace['Sequence_number'] = trace['Sequence_number'].fillna(0)

        trace = trace.astype(
            {"RSSI_dBm": int, "Channel": int, "Sequence_number": int})
        trace.to_csv(csv_path, index=False, header=True, sep='\t')

    @classmethod
    def _anonymize_mac(cls, mac):
        """Return the truncated SHA-512 hex digest of ``str(mac)``."""
        # NOTE(review): the hash is unsalted, so a determined attacker could
        # enumerate MAC addresses to reverse it. It is left unchanged so that
        # new output stays comparable with previously anonymized traces.
        digest = hashlib.sha512(str(mac).encode()).hexdigest()
        return digest[:cls._DIGEST_LENGTH]

    def _anonymize_trace(self, csv_path, anon_path):
        """Copy ``csv_path`` to ``anon_path`` with hashed source MAC addresses."""
        # Start from scratch: the chunks below are appended to the file.
        if os.path.exists(anon_path):
            os.remove(anon_path)

        digests = {}  # value seen so far -> its digest, shared by all chunks
        write_header = True
        chunks = pd.read_csv(csv_path, delimiter='\t',
                             chunksize=self._CHUNK_SIZE)
        for chunk in chunks:
            macs = chunk['Source_MAC_address'].tolist()
            for mac in macs:
                if mac not in digests:
                    digests[mac] = self._anonymize_mac(mac)
            chunk['Source_MAC_address'] = [digests[mac] for mac in macs]

            # Only the first chunk carries the header line.
            chunk.to_csv(anon_path, index=False, header=write_header,
                         mode='a', sep='\t')
            write_header = False

def list_pcap_files(directory):
    """Return the names of the .pcap files in ``directory`` in numeric order.

    Files are selected by their extension rather than by their position in
    the directory listing, so other entries in the folder (for example the
    output folders created by the script) never shift the selection.

    Names are ordered by the integer formed from all digits they contain;
    names without any digit come first and ties are broken alphabetically.
    """
    def sort_key(name):
        digits = re.sub(r'\D', '', name)
        return (int(digits) if digits else -1, name)

    captures = [
        name for name in os.listdir(directory)
        if name.lower().endswith('.pcap')
        and os.path.isfile(os.path.join(directory, name))
    ]
    return sorted(captures, key=sort_key)


def main(directory='Traces/'):
    """Anonymize every .pcap file found in ``directory`` and report the time.

    Args:
        directory: The directory where the .pcap files are saved. The path
            is expected to end with a path separator.
    """
    start = time.perf_counter()

    anonymize_pcap = PCAP()
    anonymize_pcap.directory = directory

    for name in list_pcap_files(directory):
        print("Anonymizing file : " + name)
        anonymize_pcap.parsing(name)
        print("Anonymized")

    end = time.perf_counter()

    print("ALL FILES HAVE BEEN ANONYMIZED")
    print("------------------------------------------------------------")
    print("Time taken by the program in seconds: ", end-start)
    print("------------------------------------------------------------")

# Run the anonymization only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == '__main__':
    main()