Merge pull request #13 from google/remove-uri-path

mbernico · web-flow · commit f03063c76393 · 2020-08-10T11:50:56.000-05:00
Replace the image_uri feature with image_name (the image file's basename) in the TFRecord output.
diff --git a/Makefile b/Makefile
@@ -1,4 +1,4 @@
-all: init pylint coverage test
+all: init test pylint
 
 init:
 	pip install -r requirements.txt
diff --git a/requirements.txt b/requirements.txt
@@ -9,4 +9,5 @@ pylint >= 2.5.3
 fire >= 0.3.1
 jupyter >= 1.0.0
 tensorflow >= 2.2.0
-pyarrow < 0.17
+pyarrow < 0.17
+frozendict >= 1.2
diff --git a/tfrecorder/beam_image.py b/tfrecorder/beam_image.py
@@ -17,6 +17,7 @@
 
 import base64
 import logging
+import os
 from typing import Any, Dict, Generator, Tuple
 
 import apache_beam as beam
@@ -73,10 +74,10 @@ def load(image_uri):
 class ExtractImagesDoFn(beam.DoFn):
   """Adds image to PCollection."""
 
-  def __init__(self, image_key: str):
+  def __init__(self, image_uri_key: str):
     """Constructor."""
     super().__init__()
-    self.image_key = image_key
+    self.image_uri_key = image_uri_key
     self.image_good_counter = Metrics.counter(self.__class__, 'image_good')
     self.image_bad_counter = Metrics.counter(self.__class__, 'image_bad')
 
@@ -95,9 +96,9 @@ def process(
     d = {}
 
     try:
-      image_uri = element[self.image_key]
+      image_uri = element.pop(self.image_uri_key)
       image = load(image_uri)
-      # TODO(cezequiel): Remove path from image_uri -> image_name
+      element['image_name'] = os.path.split(image_uri)[-1]
       d['image'] = encode(image)
       d['image_width'], d['image_height'] = image.size
       d['image_channels'] = mode_to_channel(image.mode)
diff --git a/tfrecorder/beam_image_test.py b/tfrecorder/beam_image_test.py
@@ -112,6 +112,6 @@ def _equal(actual):
                   .format(actual_keys, expected_keys_))
         return _equal
 
-      expected_keys = ['image_uri', 'label', 'split', 'image',
+      expected_keys = ['image_name', 'label', 'split', 'image',
                        'image_height', 'image_width', 'image_channels']
       util.assert_that(data, key_matcher(expected_keys))
diff --git a/tfrecorder/check.py b/tfrecorder/check.py
@@ -100,8 +100,8 @@ def check_tfrecords(
       writer.writerow(row)
 
       # Save image data to a file
-      if 'image_uri' in r:
-        _, image_filename = os.path.split(_stringify(r['image_uri']))
+      if 'image_name' in r:
+        _, image_filename = os.path.split(_stringify(r['image_name']))
         image_path = os.path.join(data_dir, image_filename)
         _save_image_from_record(r, image_path)
 
diff --git a/tfrecorder/check_test.py b/tfrecorder/check_test.py
@@ -86,6 +86,8 @@ def setUp(self):
 
     data = test_utils.get_test_data()
     num_records = len(data[constants.IMAGE_URI_KEY])
+    image_uris = data.pop(constants.IMAGE_URI_KEY)
+    data['image_name'] = [os.path.split(uri)[-1] for uri in image_uris]
     data.update({
         'image': [beam_image.encode(image_fn())
                   for _ in range(num_records)],
@@ -123,8 +125,7 @@ def test_valid_records(self, mock_fn):
       # Check output images
       actual_image_files = [
           f for f in os.listdir(actual_dir) if f.endswith('.jpg')]
-      expected_image_files = [
-          os.path.split(f)[-1] for f in self.data['image_uri']]
+      expected_image_files = self.data['image_name']
       self.assertCountEqual(actual_image_files, expected_image_files)
 
 
diff --git a/tfrecorder/constants.py b/tfrecorder/constants.py
@@ -16,9 +16,9 @@
 
 """Global constants."""
 
-import collections
 import logging
 
+import frozendict
 import tensorflow as tf
 from tensorflow_transform.tf_metadata import dataset_metadata
 from tensorflow_transform.tf_metadata import schema_utils
@@ -31,14 +31,16 @@
 LABEL_KEY = 'label'
 IMAGE_CSV_COLUMNS = [SPLIT_KEY, IMAGE_URI_KEY, LABEL_KEY]
 
-IMAGE_CSV_FEATURE_SPEC = {
+IMAGE_CSV_FEATURE_SPEC = frozendict.FrozenOrderedDict({
     SPLIT_KEY: tf.io.FixedLenFeature([], tf.string),
     IMAGE_URI_KEY: tf.io.FixedLenFeature([], tf.string),
     LABEL_KEY: tf.io.FixedLenFeature([], tf.string),
-}
+})
 
-RAW_FEATURE_SPEC = collections.OrderedDict(IMAGE_CSV_FEATURE_SPEC)
-RAW_FEATURE_SPEC.update({
+RAW_FEATURE_SPEC = frozendict.FrozenOrderedDict({
+    SPLIT_KEY: tf.io.FixedLenFeature([], tf.string),
+    LABEL_KEY: tf.io.FixedLenFeature([], tf.string),
+    'image_name': tf.io.FixedLenFeature([], tf.string),
     'image': tf.io.FixedLenFeature([], tf.string),
     'image_height': tf.io.FixedLenFeature([], tf.int64),
     'image_width': tf.io.FixedLenFeature([], tf.int64),
diff --git a/tfrecorder/test_data/sample_tfrecords/schema.pbtxt b/tfrecorder/test_data/sample_tfrecords/schema.pbtxt
@@ -26,7 +26,7 @@ feature {
   }
 }
 feature {
-  name: "image_uri"
+  name: "image_name"
   type: BYTES
   presence {
     min_fraction: 1.0
diff --git a/tfrecorder/test_data/sample_tfrecords/test-00000-of-00001.tfrecord.gz b/tfrecorder/test_data/sample_tfrecords/test-00000-of-00001.tfrecord.gz
diff --git a/tfrecorder/test_data/sample_tfrecords/train-00000-of-00001.tfrecord.gz b/tfrecorder/test_data/sample_tfrecords/train-00000-of-00001.tfrecord.gz
diff --git a/tfrecorder/test_data/sample_tfrecords/transform_fn/saved_model.pb b/tfrecorder/test_data/sample_tfrecords/transform_fn/saved_model.pb
diff --git a/tfrecorder/test_data/sample_tfrecords/transformed_metadata/schema.pbtxt b/tfrecorder/test_data/sample_tfrecords/transformed_metadata/schema.pbtxt
@@ -26,7 +26,7 @@ feature {
   }
 }
 feature {
-  name: "image_uri"
+  name: "image_name"
   type: BYTES
   presence {
     min_fraction: 1.0
diff --git a/tfrecorder/test_data/sample_tfrecords/val-00000-of-00001.tfrecord.gz b/tfrecorder/test_data/sample_tfrecords/val-00000-of-00001.tfrecord.gz

Original file line number	Diff line number	Diff line change
`@@ -1,4 +1,4 @@`
`1`		`-all: init pylint coverage test`
	`1`	`+all: init test pylint`
`2`	`2`
`3`	`3`	`init:`
`4`	`4`	`pip install -r requirements.txt`
Original file line number	Diff line number	Diff line change
`@@ -26,7 +26,7 @@ feature {`
`26`	`26`	`}`
`27`	`27`	`}`
`28`	`28`	`feature {`
`29`		`- name: "image_uri"`
	`29`	`+ name: "image_name"`
`30`	`30`	`type: BYTES`
`31`	`31`	`presence {`
`32`	`32`	`min_fraction: 1.0`