1- // Copyright 2017 Google Inc .
1+ // Copyright 2018 Google LLC .
22//
33// Licensed under the Apache License, Version 2.0 (the "License");
44// you may not use this file except in compliance with the License.
1111// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
1212// See the License for the specific language governing permissions and
1313// limitations under the License.
14+ //
1415
1516syntax = "proto3" ;
1617
@@ -101,6 +102,9 @@ message VideoContext {
101102
102103 // Config for SPEECH_TRANSCRIPTION.
103104 SpeechTranscriptionConfig speech_transcription_config = 6 ;
105+
106+ // Config for TEXT_DETECTION.
107+ TextDetectionConfig text_detection_config = 8 ;
104108}
105109
106110// Config for LABEL_DETECTION.
@@ -148,6 +152,16 @@ message FaceDetectionConfig {
148152 bool include_bounding_boxes = 2 ;
149153}
150154
155+ // Config for TEXT_DETECTION.
156+ message TextDetectionConfig {
157+ // Language hints can be specified if the language to be detected is known a
158+ // priori; they can increase the accuracy of the detection. Each language
159+ // hint must be a language code in BCP-47 format.
160+ //
161+ // Automatic language detection is performed if no hint is provided.
162+ repeated string language_hints = 1 ;
163+ }
164+
151165// Video segment.
152166message VideoSegment {
153167 // Time-offset, relative to the beginning of the video,
@@ -305,6 +319,14 @@ message VideoAnnotationResults {
305319 // Speech transcription.
306320 repeated SpeechTranscription speech_transcriptions = 11 ;
307321
322+ // OCR text detection and tracking.
323+ // Annotations for list of detected text snippets. Each will have list of
324+ // frame information associated with it.
325+ repeated TextAnnotation text_annotations = 12 ;
326+
327+ // Annotations for list of objects detected and tracked in video.
328+ repeated ObjectTrackingAnnotation object_annotations = 14 ;
329+
308330 // If set, indicates an error. Note that for a single `AnnotateVideoRequest`
309331 // some videos may succeed and some may fail.
310332 google.rpc.Status error = 9 ;
@@ -479,6 +501,115 @@ message WordInfo {
479501 int32 speaker_tag = 5 ;
480502}
481503
504+ // A vertex represents a 2D point in the image.
505+ // NOTE: the normalized vertex coordinates are relative to the original image
506+ // and nominally range from 0 to 1 (see NormalizedBoundingPoly for exceptions).
507+ message NormalizedVertex {
508+ // Normalized X coordinate.
509+ float x = 1 ;
510+
511+ // Normalized Y coordinate.
512+ float y = 2 ;
513+ }
514+
515+ // Normalized bounding polygon for text (that might not be aligned with axis).
516+ // Contains list of the corner points in clockwise order starting from
517+ // top-left corner. For example, for a rectangular bounding box:
518+ // When the text is horizontal it might look like:
519+ // 0----1
520+ // |    |
521+ // 3----2
522+ //
523+ // When it's clockwise rotated 180 degrees around the top-left corner it
524+ // becomes:
525+ // 2----3
526+ // |    |
527+ // 1----0
528+ //
529+ // and the vertex order will still be (0, 1, 2, 3). Note that values can be less
530+ // than 0, or greater than 1 due to trigonometric calculations for location of
531+ // the box.
532+ message NormalizedBoundingPoly {
533+ // Normalized vertices of the bounding polygon.
534+ repeated NormalizedVertex vertices = 1 ;
535+ }
536+
537+ // Video segment level annotation results for text detection.
538+ message TextSegment {
539+ // Video segment where a text snippet was detected.
540+ VideoSegment segment = 1 ;
541+
542+ // Confidence for the track of detected text. It is calculated as the highest
543+ // value over all frames where OCR detected text appears.
544+ float confidence = 2 ;
545+
546+ // Information related to the frames where OCR detected text appears.
547+ repeated TextFrame frames = 3 ;
548+ }
549+
550+ // Video frame level annotation results for text annotation (OCR).
551+ // Contains information regarding timestamp and bounding box locations for the
552+ // frames containing detected OCR text snippets.
553+ message TextFrame {
554+ // Bounding polygon of the detected text for this frame.
555+ NormalizedBoundingPoly rotated_bounding_box = 1 ;
556+
557+ // Timestamp of this frame, expressed as a Duration offset.
558+ google.protobuf.Duration time_offset = 2 ;
559+ }
560+
561+ // Annotations related to one detected OCR text snippet. This contains the
562+ // corresponding text and, per video segment, the confidence value and frame
563+ // level information for each detection.
564+ message TextAnnotation {
565+ // The detected text.
566+ string text = 1 ;
567+
568+ // All video segments where the OCR detected text appears.
569+ repeated TextSegment segments = 2 ;
570+ }
571+
572+ // Video frame level annotations for object detection and tracking. This
573+ // message stores the per frame location and time offset.
574+ message ObjectTrackingFrame {
575+ // The normalized bounding box location of this object track for the frame.
576+ NormalizedBoundingBox normalized_bounding_box = 1 ;
577+
578+ // The timestamp of the frame, as a Duration (not a raw microsecond count).
579+ google.protobuf.Duration time_offset = 2 ;
580+ }
581+
582+ // Annotations corresponding to one tracked object.
583+ message ObjectTrackingAnnotation {
584+ // Different representation of tracking info in non-streaming batch
585+ // and streaming modes.
586+ oneof track_info {
587+ // Non-streaming batch mode ONLY.
588+ // Each object track corresponds to one video segment where it appears.
589+ VideoSegment segment = 3 ;
590+
591+ // Streaming mode ONLY.
592+ // In streaming mode, we do not know the end time of a tracked object
593+ // before it is completed. Hence, there is no VideoSegment info returned.
594+ // Instead, we provide a uniquely identifiable integer track_id so that
595+ // the customers can correlate the results of the ongoing
596+ // ObjectTrackingAnnotation of the same track_id over time.
597+ int64 track_id = 5 ;
598+ }
599+
600+ // Entity to specify the object category that this track is labeled as.
601+ Entity entity = 1 ;
602+
603+ // Object category's labeling confidence of this track.
604+ float confidence = 4 ;
605+
606+ // Information corresponding to all frames where this object track appears.
607+ // Non-streaming batch mode: frames may contain one or multiple
608+ // ObjectTrackingFrame messages.
609+ // Streaming mode: frames can only contain one ObjectTrackingFrame message.
610+ repeated ObjectTrackingFrame frames = 2 ;
611+ }
612+
482613// Video annotation feature.
483614enum Feature {
484615 // Unspecified.
@@ -498,6 +629,12 @@ enum Feature {
498629
499630 // Speech transcription.
500631 SPEECH_TRANSCRIPTION = 6 ;
632+
633+ // OCR text detection and tracking.
634+ TEXT_DETECTION = 7 ;
635+
636+ // Object detection and tracking.
637+ OBJECT_TRACKING = 9 ;
501638}
502639
503640// Label detection mode.
0 commit comments