documentcloud · jhosteny · May 16, 2013 · May 17, 2013
diff --git a/lib/docsplit/command_line.rb b/lib/docsplit/command_line.rb
@@ -91,6 +91,9 @@ def parse_options
         opts.on('--[no-]ocr', 'force OCR to be used, or disable OCR') do |o|
           @options[:ocr] = o
         end
+        opts.on('--hocr', 'force hOCR output when OCR enabled') do |h|
+          @options[:hocr] = h
+        end
         opts.on('--no-clean', 'disable cleaning of OCR\'d text') do |c|
           @options[:clean] = false
         end
@@ -119,4 +122,4 @@ def parse_options
 
   end
 
-end
+end
diff --git a/lib/docsplit/text_extractor.rb b/lib/docsplit/text_extractor.rb
@@ -60,21 +60,25 @@ def extract_from_ocr(pdf, pages)
       tempdir = Dir.mktmpdir
       base_path = File.join(@output, @pdf_name)
       escaped_pdf = ESCAPE[pdf]
+      # Trailing tesseract argument: "hocr" requests hOCR output when --hocr was given.
+      additional_opts = @use_hocr ? "hocr " : ""
       if pages
         pages.each do |page|
           tiff = "#{tempdir}/#{@pdf_name}_#{page}.tif"
           escaped_tiff = ESCAPE[tiff]
           file = "#{base_path}_#{page}"
           run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert -despeckle +adjoin #{MEMORY_ARGS} #{OCR_FLAGS} #{escaped_pdf}[#{page - 1}] #{escaped_tiff} 2>&1"
-          run "tesseract #{escaped_tiff} #{ESCAPE[file]} -l #{@language} 2>&1"
+          run "tesseract #{escaped_tiff} #{ESCAPE[file]} -l #{@language} #{additional_opts} 2>&1"
           clean_text(file + '.txt') if @clean_ocr
+          run "cp #{escaped_tiff} #{ESCAPE[file + '.tif']}" if @use_hocr
           FileUtils.remove_entry_secure tiff
         end
       else
         tiff = "#{tempdir}/#{@pdf_name}.tif"
         escaped_tiff = ESCAPE[tiff]
         run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert -despeckle #{MEMORY_ARGS} #{OCR_FLAGS} #{escaped_pdf} #{escaped_tiff} 2>&1"
-        run "tesseract #{escaped_tiff} #{base_path} -l #{@language} 2>&1"
+        run "tesseract #{escaped_tiff} #{ESCAPE[base_path]} -l #{@language} #{additional_opts} 2>&1"
+        run "cp #{escaped_tiff} #{ESCAPE[base_path + '.tif']}" if @use_hocr
         clean_text(base_path + '.txt') if @clean_ocr
       end
     ensure
@@ -120,11 +124,12 @@ def extract_options(options)
       @output     = options[:output] || '.'
       @pages      = options[:pages]
       @force_ocr  = options[:ocr] == true
+      @use_hocr   = options[:hocr] == true
       @forbid_ocr = options[:ocr] == false
-      @clean_ocr  = !(options[:clean] == false)
+      @clean_ocr  = !(options[:clean] == false) && !@use_hocr
       @language   = options[:language] || 'eng'
     end
 
   end
 
-end
+end