From 2b27eeaeba42003c5914f100a4ad3b505a71d7ba Mon Sep 17 00:00:00 2001 From: Dan Jensen Date: Fri, 1 Jun 2018 13:34:29 -0400 Subject: [PATCH] Add page_size to InfoExtractor Introduces extract_page_size and related test coverage. --- lib/docsplit.rb | 2 +- lib/docsplit/info_extractor.rb | 1 + test/unit/test_extract_info.rb | 7 ++++++- 3 files changed, 8 insertions(+), 2 deletions(-) diff --git a/lib/docsplit.rb b/lib/docsplit.rb index 1c49e91..7e013cc 100755 --- a/lib/docsplit.rb +++ b/lib/docsplit.rb @@ -12,7 +12,7 @@ module Docsplit ROOT = File.expand_path(File.dirname(__FILE__) + '/..') ESCAPED_ROOT = ESCAPE[ROOT] - METADATA_KEYS = [:author, :date, :creator, :keywords, :producer, :subject, :title, :length] + METADATA_KEYS = [:author, :date, :creator, :keywords, :page_size, :producer, :subject, :title, :length] GM_FORMATS = ["image/gif", "image/jpeg", "image/png", "image/x-ms-bmp", "image/svg+xml", "image/tiff", "image/x-portable-bitmap", "application/postscript", "image/x-portable-pixmap"] diff --git a/lib/docsplit/info_extractor.rb b/lib/docsplit/info_extractor.rb index 2e34c85..c54dc6f 100644 --- a/lib/docsplit/info_extractor.rb +++ b/lib/docsplit/info_extractor.rb @@ -9,6 +9,7 @@ class InfoExtractor :date => /^CreationDate:\s+([^\n]+)/, :creator => /^Creator:\s+([^\n]+)/, :keywords => /^Keywords:\s+([^\n]+)/, + :page_size=> /^Page size:\s+([^\n]+)/, :producer => /^Producer:\s+([^\n]+)/, :subject => /^Subject:\s+([^\n]+)/, :title => /^Title:\s+([^\n]+)/, diff --git a/test/unit/test_extract_info.rb b/test/unit/test_extract_info.rb index 08fdd91..8f45428 100755 --- a/test/unit/test_extract_info.rb +++ b/test/unit/test_extract_info.rb @@ -23,6 +23,10 @@ def test_length assert 2 == Docsplit.extract_length('test/fixtures/obama_arts.pdf') end + def test_page_size + assert "612 x 792 pts (letter)" == Docsplit.extract_page_size('test/fixtures/encrypted.pdf') + end + def test_producer assert "Mac OS X 10.6.2 Quartz PDFContext" == Docsplit.extract_producer('test/fixtures/encrypted.pdf') end @@ -46,10 +50,11 @@ def test_extract_all assert metadata[:author] == "mkommareddi" assert metadata[:date] == "Thu Nov 29 14:54:46 2007" assert metadata[:creator] == "PScript5.dll Version 5.2" + assert metadata[:page_size] == "612 x 792 pts (letter)" assert metadata[:producer] == "Acrobat Distiller 8.1.0 (Windows)" assert metadata[:title] == "Microsoft Word - Fact Sheet Arts 112907 FINAL.doc" assert metadata[:length] == 2 - assert metadata.length == 6 + assert metadata.length == 7 end end