diff --git a/LICENSE b/LICENSE index c4588f7..ab150ce 100644 --- a/LICENSE +++ b/LICENSE @@ -1,6 +1,7 @@ The MIT License (MIT) -Copyright (c) 2015 armbues +Original work: Copyright (c) 2015 armbues +Additional work: (c) Copyright 2016 Hewlett Packard Enterprise Development LP Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -18,5 +19,4 @@ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. - +SOFTWARE. \ No newline at end of file diff --git a/README.md b/README.md index eb37c17..97bb12b 100644 --- a/README.md +++ b/README.md @@ -3,10 +3,10 @@ IOC Parser is a tool to extract indicators of compromise from security reports i ## Usage **iocp.py [-h] [-p INI] [-i FORMAT] [-o FORMAT] [-d] [-l LIB] FILE** -* *FILE* File/directory path to report(s) +* *FILE* File/directory path to report(s)/Gmail account in double quotes ("username@gmail.com password") * *-p INI* Pattern file -* *-i FORMAT* Input format (pdf/txt/html) -* *-o FORMAT* Output format (csv/json/yara) +* *-i FORMAT* Input format (pdf/txt/html/csv/xls/xlsx/gmail) +* *-o FORMAT* Output format (csv/json/yara/netflow) * *-d* Deduplicate matches * *-l LIB* Parsing library @@ -19,4 +19,10 @@ For HTML parsing support: * [BeautifulSoup](http://www.crummy.com/software/BeautifulSoup/) - *pip install beautifulsoup4* For HTTP(S) support: -* [requests](http://docs.python-requests.org/en/latest/) - *pip install requests* \ No newline at end of file +* [requests](http://docs.python-requests.org/en/latest/) - *pip install requests* + +For XLS/XLSX support: +* [xlrd](https://github.com/python-excel/xlrd) - *pip install xlrd* + +For Gmail support: +* 
[gmail](https://github.com/charlierguo/gmail) \ No newline at end of file diff --git a/iocp.py b/iocp.py index 9968cc2..b91fad1 100755 --- a/iocp.py +++ b/iocp.py @@ -40,6 +40,7 @@ import fnmatch import argparse import re +import csv from StringIO import StringIO try: import configparser as ConfigParser @@ -48,6 +49,18 @@ # Import optional third-party libraries IMPORTS = [] +try: + import xlrd + IMPORTS.append('xlrd') +except ImportError: + pass + +try: + import gmail + IMPORTS.append('gmail') +except ImportError: + pass + try: from PyPDF2 import PdfFileReader IMPORTS.append('pypdf2') @@ -81,7 +94,7 @@ class IOC_Parser(object): patterns = {} defang = {} - def __init__(self, patterns_ini=None, input_format='pdf', dedup=False, library='pdfminer', output_format='csv', output_handler=None): + def __init__(self, patterns_ini=None, input_format='pdf', dedup=False, library='pdfminer', output_format='csv', proxy=None, output_handler=None): basedir = os.path.dirname(os.path.abspath(__file__)) if patterns_ini is None: patterns_ini = os.path.join(basedir, 'patterns.ini') @@ -89,6 +102,16 @@ def __init__(self, patterns_ini=None, input_format='pdf', dedup=False, library=' self.load_patterns(patterns_ini) self.whitelist = WhiteList(basedir) self.dedup = dedup + + # Depending on the type of proxy, set the proper proxy setting for storage to be used with Requests + if proxy is not None: + if proxy.startswith('http://'): + self.proxy = {'http': proxy} + elif proxy.startswith('https://'): + self.proxy = {'https': proxy} + else: + self.proxy = proxy + if output_handler: self.handler = output_handler else: @@ -111,6 +134,14 @@ def __init__(self, patterns_ini=None, input_format='pdf', dedup=False, library=' if 'beautifulsoup' not in IMPORTS: e = 'HTML parser library not found: BeautifulSoup' raise ImportError(e) + elif input_format == 'xlsx': + if 'xlrd' not in IMPORTS: + e = 'XLRD Library not found. 
Please visit: https://github.com/python-excel/xlrd or pip install xlrd' + raise ImportError(e) + elif input_format == 'gmail': + if 'gmail' not in IMPORTS: + e = 'Gmail library not found. Please visit: https://github.com/charlierguo/gmail' + raise ImportError(e) def load_patterns(self, fpath): config = ConfigParser.ConfigParser() @@ -144,7 +175,20 @@ def is_whitelisted(self, ind_match, ind_type): pass return False - def parse_page(self, fpath, data, page_num): + def parse_page(self, fpath, data, page_num, flag=0, sheet_name=''): + """ Added flag and sheet_name variables for new inputs to help properly + print output + + @param fpath: the file path, directory, URL or email account + @param data: the data to be parsed + @param page_num: the page number of a pdf, line number of csv, xls or xlsx + @param flag: + 0 = default (pdf/txt/html) + 1 = gmail + 2 = csv + 3 = xls and xlsx + @param sheet_name: to be used only with Excel spreadsheets + """ for ind_type, ind_regex in self.patterns.items(): matches = ind_regex.findall(data) @@ -164,7 +208,8 @@ def parse_page(self, fpath, data, page_num): self.dedup_store.add((ind_type, ind_match)) - self.handler.print_match(fpath, page_num, ind_type, ind_match) + # Added flag and sheet_name to determine which type of output to display + self.handler.print_match(fpath, page_num, ind_type, ind_match, flag, sheet_name) def parse_pdf_pypdf2(self, f, fpath): try: @@ -246,7 +291,7 @@ def parse_html(self, f, fpath): self.dedup_store = set() data = f.read() - soup = BeautifulSoup(data) + soup = BeautifulSoup(data, 'html.parser') # Add "html.parser" to suppress user warning html = soup.findAll(text=True) text = u'' @@ -266,6 +311,124 @@ def parse_html(self, f, fpath): except Exception as e: self.handler.print_error(fpath, e) + def parse_csv(self, f, fpath): + """ This method is used to parse a csv file. The flag + used for this method to send to output.py is 2. 
+ + @author Robb Krasnow + """ + try: + if self.dedup: + self.dedup_store = set() + + self.handler.print_header(fpath) + + with open(fpath, 'rb') as csvfile: + csv_data = csv.reader(csvfile, delimiter=',', quotechar='|') + + for row in csv_data: + line = ', '.join(row).rstrip() + unicode_output = unicode(line, 'ascii', errors='ignore') + + self.parse_page(fpath, unicode_output, csv_data.line_num, 2) + + self.handler.print_footer(fpath) + except (KeyboardInterrupt, SystemExit): + raise + except Exception as e: + self.handler.print_error(fpath, e) + + + def parse_xls(self, f, fpath): + """ Created this function just to allow a user to use 'xls' as an input + option without any errors. + + @author Robb Krasnow + """ + self.parse_xlsx(f, fpath) + + + def parse_xlsx(self, f, fpath): + """ This method is used to parse Microsoft Excel files + with either .xls or .xlsx extensions. The flag + used for this method to send to output.py is 3. Because + Excel spreadsheets may have multiple tabs, the sheet's + name is passed through the parse_page method in turn showing + that in the output. + + @author Robb Krasnow + """ + try: + if self.dedup: + self.dedup_store = set() + + self.handler.print_header(fpath) + workbook = xlrd.open_workbook(fpath) + sheets = workbook.sheets() + + for sheet in sheets: + sheet_name = sheet.name + + for row in range(sheet.nrows): + for col in range(sheet.ncols): + if sheet.cell_value(row, col) is not xlrd.empty_cell.value: + val = repr(sheet.cell_value(row, col)) + + self.parse_page(fpath, val, row+1, 3, sheet_name) + + self.handler.print_footer(fpath) + except (KeyboardInterrupt, SystemExit): + raise + except Exception as e: + self.handler.print_error(fpath, e) + + + def parse_gmail(self, username, password): + """ This method is used to parse the inbox of a valid + Gmail account. The flag used for this method to send to + output.py is 1. 
+ + @author Robb Krasnow + @param username The gmail account's username + @param password The gmail account's password + """ + try: + if self.dedup: + self.dedup_store = set() + + # Log the user in + g = gmail.login(username, password) + + # When the user is logged in, grab all the email from their inbox + # and parse all the messages for IOCs + if g.logged_in: + print '***** Login Successful. *****\n' + + self.handler.print_header(username) + emails = g.inbox().mail() + + for email in range(0, len(emails)): + try: + emails[email].fetch() + content = emails[email].body + subject = re.sub('(^\s|re:\s+|\r\n|fwd:\s+)', '', emails[email].subject, flags=re.IGNORECASE) + + self.parse_page(subject, content, 1, 1) + except Exception as e: + continue + + self.handler.print_footer(username) + + print '\n***** %s emails found. *****' % len(emails) + g.logout() + print '***** Logout Successful. *****' + else: + sys.exit() + except gmail.exceptions.AuthenticationError: + print 'Authentication Error' + sys.exit() + + def parse(self, path): try: if path.startswith('http://') or path.startswith('https://'): @@ -273,8 +436,14 @@ def parse(self, path): e = 'HTTP library not found: requests' raise ImportError(e) headers = { 'User-Agent': 'Mozilla/5.0 Gecko Firefox' } - r = requests.get(path, headers=headers) - r.raise_for_status() + + # If using proxy, make request with proxy from --proxy switch + # Otherwise make the call normally + if self.proxy is not None: + r = requests.get(path, headers=headers, proxies=self.proxy) + else: + r = requests.get(path, headers=headers) + f = StringIO(r.content) self.parser_func(f, path) return @@ -289,6 +458,15 @@ def parse(self, path): with open(fpath, 'rb') as f: self.parser_func(f, fpath) return + # Check if the input from CLI has @gmail.com attached + # If so, grab the credentials, and send them to parse_gmail() + elif path.count('@gmail.com ') == 1 and len(path.split()) == 2: + gmail_account = path.split() + username = gmail_account[0] + 
password = gmail_account[1] + self.parser_func(username, password) + + return e = 'File path is not a file, directory or URL: %s' % (path) raise IOError(e) @@ -299,13 +477,14 @@ def parse(self, path): if __name__ == "__main__": argparser = argparse.ArgumentParser() - argparser.add_argument('PATH', action='store', help='File/directory/URL to report(s)') + argparser.add_argument('PATH', action='store', help='File/directory/URL to report(s)/Gmail account in double quotes ("username@gmail.com password")') argparser.add_argument('-p', dest='INI', default=None, help='Pattern file') - argparser.add_argument('-i', dest='INPUT_FORMAT', default='pdf', help='Input format (pdf/txt/html)') + argparser.add_argument('-i', dest='INPUT_FORMAT', default='pdf', help='Input format (pdf/txt/html/csv/xls/xlsx/gmail)') argparser.add_argument('-o', dest='OUTPUT_FORMAT', default='csv', help='Output format (csv/json/yara/netflow)') argparser.add_argument('-d', dest='DEDUP', action='store_true', default=False, help='Deduplicate matches') argparser.add_argument('-l', dest='LIB', default='pdfminer', help='PDF parsing library (pypdf2/pdfminer)') + argparser.add_argument('--proxy', dest='PROXY', default=None, help='Sets proxy (http(s)://server:port)') args = argparser.parse_args() - parser = IOC_Parser(args.INI, args.INPUT_FORMAT, args.DEDUP, args.LIB, args.OUTPUT_FORMAT) + parser = IOC_Parser(args.INI, args.INPUT_FORMAT, args.DEDUP, args.LIB, args.OUTPUT_FORMAT, args.PROXY) parser.parse(args.PATH) diff --git a/output.py b/output.py index d71c92c..b5f8525 100644 --- a/output.py +++ b/output.py @@ -33,21 +33,46 @@ class OutputHandler_csv(OutputHandler): def __init__(self): self.csv_writer = csv.writer(sys.stdout, delimiter = '\t') - def print_match(self, fpath, page, name, match): - self.csv_writer.writerow((fpath, page, name, match)) + # Added flag and sheet which are unused but needed to make CSV output work + def print_match(self, fpath, page, name, match, flag, sheet=''): + 
self.csv_writer.writerow((fpath, page, name, match, sheet)) def print_error(self, fpath, exception): self.csv_writer.writerow((fpath, '0', 'error', exception)) -class OutputHandler_json(OutputHandler): - def print_match(self, fpath, page, name, match): - data = { - 'path' : fpath, - 'file' : os.path.basename(fpath), - 'page' : page, - 'type' : name, - 'match': match - } +class OutputHandler_json(OutputHandler): + def print_match(self, fpath, page, name, match, flag, sheet=''): + """ @param flag: + 0 = default (pdf/txt/html) + 1 = gmail + 2 = csv + 3 = xls and xlsx + @param sheet The sheet being parsed if Excel spreadsheet (single or multi-sheet) + """ + if flag == 0 or flag == 2: + data = { + 'path' : fpath, + 'file' : os.path.basename(fpath), + 'page' : page, + 'type' : name, + 'match': match + } + elif flag == 1: + data = { + 'input' : 'gmail', + 'subject' : fpath, + 'type' : name, + 'match': match + } + elif flag == 3: + data = { + 'path' : fpath, + 'file' : os.path.basename(fpath), + 'sheet' : sheet, + 'line' : page, + 'type' : name, + 'match': match, + } print(json.dumps(data)) @@ -65,7 +90,8 @@ class OutputHandler_yara(OutputHandler): def __init__(self): self.rule_enc = ''.join(chr(c) if chr(c).isupper() or chr(c).islower() or chr(c).isdigit() else '_' for c in range(256)) - def print_match(self, fpath, page, name, match): + # Added flag and sheet which are unused but needed to make YARA output work + def print_match(self, fpath, page, name, match, flag, sheet=''): if name in self.cnt: self.cnt[name] += 1 else: @@ -97,7 +123,8 @@ class OutputHandler_netflow(OutputHandler): def __init__(self): print "host 255.255.255.255" - def print_match(self, fpath, page, name, match): + # Added flag and sheet which are unused but needed to make Netflow output work + def print_match(self, fpath, page, name, match, flag, sheet=''): data = { 'type' : name, 'match': match