Add DOCX input support (via the optional docx2txt package) to the IOC parser.

Note: the parse_docx and requirements.txt hunks were revised after the
original commit, so the post-image blob ids on the "index" lines for
iocp/Parser.py and requirements.txt no longer match the patched content.

diff --git a/bin/iocp b/bin/iocp
index 8047f6f..832d6b4 100755
--- a/bin/iocp
+++ b/bin/iocp
@@ -43,7 +43,7 @@ if __name__ == "__main__":
     argparser = argparse.ArgumentParser()
     argparser.add_argument('PATH', action='store', help='File/directory/URL to report(s)')
     argparser.add_argument('-p', dest='INI', default=None, help='Pattern file')
-    argparser.add_argument('-i', dest='INPUT_FORMAT', default='pdf', help='Input format (pdf/txt/html)')
+    argparser.add_argument('-i', dest='INPUT_FORMAT', default='pdf', help='Input format (pdf/txt/html/docx)')
     argparser.add_argument('-o', dest='OUTPUT_FORMAT', default='csv', help='Output format (csv/tsv/json/yara/netflow)')
     argparser.add_argument('-d', dest='DEDUP', action='store_true', default=False, help='Deduplicate matches')
     argparser.add_argument('-l', dest='LIB', default='pdfminer', help='PDF parsing library (pypdf2/pdfminer)')
diff --git a/iocp/Parser.py b/iocp/Parser.py
index f8d084a..8f96bd5 100644
--- a/iocp/Parser.py
+++ b/iocp/Parser.py
@@ -75,6 +75,11 @@
     IMPORTS.append('requests')
 except ImportError:
     pass
+try:
+    import docx2txt
+    IMPORTS.append('docx2txt')
+except ImportError:
+    pass
 
 # Import project source files
 import iocp
@@ -276,6 +281,28 @@ def parse_html(self, f, fpath):
         except (KeyboardInterrupt, SystemExit):
             raise
 
+    def parse_docx(self, f, fpath):
+        """Extract the text of a .docx report and scan it as a single page.
+
+        `f` is passed to docx2txt.process(); `fpath` labels the handler output.
+        Raises ImportError if the optional docx2txt package is not installed.
+        """
+        # docx2txt is imported optionally above; fail clearly, not with a NameError.
+        if 'docx2txt' not in IMPORTS:
+            raise ImportError('DOCX parser library not found: docx2txt')
+
+        try:
+            text = docx2txt.process(f)
+
+            if self.dedup:
+                self.dedup_store = set()
+
+            self.handler.print_header(fpath)
+            self.parse_page(fpath, text, 1)
+            self.handler.print_footer(fpath)
+        except (KeyboardInterrupt, SystemExit):
+            raise
+
     def parse(self, path):
         try:
             if path.startswith('http://') or path.startswith('https://'):
diff --git a/requirements.txt b/requirements.txt
index 5ff7a2e..29e7e1e 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -2,3 +2,4 @@ beautifulsoup4>=4.4.1
 pdfminer>=20140328
 PyPDF2>=1.26.0
 requests>=2.10.0
+docx2txt>=0.6