Skip to content

Commit b4d993c

Browse files
yoshi-automation and JustinBeckwith
authored and committed
feat: add text detection and object mapping support (#197)
1 parent e33937e commit b4d993c

File tree

3 files changed

+372
-7
lines changed

3 files changed

+372
-7
lines changed

packages/google-cloud-videointelligence/protos/google/cloud/videointelligence/v1/video_intelligence.proto

Lines changed: 138 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
// Copyright 2017 Google Inc.
1+
// Copyright 2018 Google LLC.
22
//
33
// Licensed under the Apache License, Version 2.0 (the "License");
44
// you may not use this file except in compliance with the License.
@@ -11,6 +11,7 @@
1111
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
1212
// See the License for the specific language governing permissions and
1313
// limitations under the License.
14+
//
1415

1516
syntax = "proto3";
1617

@@ -101,6 +102,9 @@ message VideoContext {
101102

102103
// Config for SPEECH_TRANSCRIPTION.
103104
SpeechTranscriptionConfig speech_transcription_config = 6;
105+
106+
// Config for TEXT_DETECTION.
107+
TextDetectionConfig text_detection_config = 8;
104108
}
105109

106110
// Config for LABEL_DETECTION.
@@ -148,6 +152,16 @@ message FaceDetectionConfig {
148152
bool include_bounding_boxes = 2;
149153
}
150154

155+
// Config for TEXT_DETECTION.
message TextDetectionConfig {
  // Language hint can be specified if the language to be detected is known a
  // priori. It can increase the accuracy of the detection. Language hint must
  // be language code in BCP-47 format (e.g. "en-US").
  //
  // Automatic language detection is performed if no hint is provided.
  repeated string language_hints = 1;
}
164+
151165
// Video segment.
152166
message VideoSegment {
153167
// Time-offset, relative to the beginning of the video,
@@ -305,6 +319,14 @@ message VideoAnnotationResults {
305319
// Speech transcription.
306320
repeated SpeechTranscription speech_transcriptions = 11;
307321

322+
// OCR text detection and tracking.
323+
// Annotations for list of detected text snippets. Each will have list of
324+
// frame information associated with it.
325+
repeated TextAnnotation text_annotations = 12;
326+
327+
// Annotations for list of objects detected and tracked in video.
328+
repeated ObjectTrackingAnnotation object_annotations = 14;
329+
308330
// If set, indicates an error. Note that for a single `AnnotateVideoRequest`
309331
// some videos may succeed and some may fail.
310332
google.rpc.Status error = 9;
@@ -479,6 +501,115 @@ message WordInfo {
479501
int32 speaker_tag = 5;
480502
}
481503

504+
// A vertex represents a 2D point in the image.
// NOTE: the normalized vertex coordinates are relative to the original image
// and range from 0 to 1.
message NormalizedVertex {
  // X coordinate, normalized to the image width (nominally in [0, 1]).
  float x = 1;

  // Y coordinate, normalized to the image height (nominally in [0, 1]).
  float y = 2;
}
514+
515+
// Normalized bounding polygon for text (that might not be aligned with axis).
// Contains list of the corner points in clockwise order starting from
// top-left corner. For example, for a rectangular bounding box:
// When the text is horizontal it might look like:
//         0----1
//         |    |
//         3----2
//
// When it's clockwise rotated 180 degrees around the top-left corner it
// becomes:
//         2----3
//         |    |
//         1----0
//
// and the vertex order will still be (0, 1, 2, 3). Note that values can be less
// than 0, or greater than 1 due to trigonometric calculations for location of
// the box.
message NormalizedBoundingPoly {
  // Normalized vertices of the bounding polygon, in clockwise order starting
  // from the top-left corner (see the diagram above).
  repeated NormalizedVertex vertices = 1;
}
536+
537+
// Video segment level annotation results for text detection.
message TextSegment {
  // Video segment where a text snippet was detected.
  VideoSegment segment = 1;

  // Confidence for the track of detected text. It is calculated as the highest
  // over all frames where OCR detected text appears.
  // NOTE(review): presumably in the range [0, 1] — confirm against the API docs.
  float confidence = 2;

  // Information related to the frames where OCR detected text appears.
  repeated TextFrame frames = 3;
}
549+
550+
// Video frame level annotation results for text annotation (OCR).
// Contains information regarding timestamp and bounding box locations for the
// frames containing detected OCR text snippets.
message TextFrame {
  // Bounding polygon of the detected text for this frame. May be rotated
  // relative to the image axes (see NormalizedBoundingPoly).
  NormalizedBoundingPoly rotated_bounding_box = 1;

  // Timestamp of this frame, as an offset from the beginning of the video.
  google.protobuf.Duration time_offset = 2;
}
560+
561+
// Annotations related to one detected OCR text snippet. This will contain the
// corresponding text, confidence value, and frame level information for each
// detection.
message TextAnnotation {
  // The detected text.
  string text = 1;

  // All video segments where OCR detected text appears.
  repeated TextSegment segments = 2;
}
571+
572+
// Video frame level annotations for object detection and tracking. This field
// stores per frame location, time offset, and confidence.
message ObjectTrackingFrame {
  // The normalized bounding box location of this object track for the frame.
  NormalizedBoundingBox normalized_bounding_box = 1;

  // The timestamp of the frame, as an offset from the beginning of the video.
  // NOTE(review): the original comment said "in microseconds", but the type is
  // google.protobuf.Duration (seconds + nanos), so no unit conversion applies
  // — confirm the intended wording upstream.
  google.protobuf.Duration time_offset = 2;
}
581+
582+
// Annotations corresponding to one tracked object.
message ObjectTrackingAnnotation {
  // Different representation of tracking info in non-streaming batch
  // and streaming modes. Exactly one of these is set per annotation.
  oneof track_info {
    // Non-streaming batch mode ONLY.
    // Each object track corresponds to one video segment where it appears.
    VideoSegment segment = 3;

    // Streaming mode ONLY.
    // In streaming mode, we do not know the end time of a tracked object
    // before it is completed. Hence, there is no VideoSegment info returned.
    // Instead, we provide a unique identifiable integer track_id so that
    // the customers can correlate the results of the ongoing
    // ObjectTrackAnnotation of the same track_id over time.
    int64 track_id = 5;
  }

  // Entity to specify the object category that this track is labeled as.
  Entity entity = 1;

  // Object category's labeling confidence of this track.
  // NOTE(review): presumably in the range [0, 1] — confirm against the API docs.
  float confidence = 4;

  // Information corresponding to all frames where this object track appears.
  // Non-streaming batch mode: it may be one or multiple ObjectTrackingFrame
  // messages in frames.
  // Streaming mode: it can only be one ObjectTrackingFrame message in frames.
  repeated ObjectTrackingFrame frames = 2;
}
612+
482613
// Video annotation feature.
483614
enum Feature {
484615
// Unspecified.
@@ -498,6 +629,12 @@ enum Feature {
498629

499630
// Speech transcription.
500631
SPEECH_TRANSCRIPTION = 6;
632+
633+
// OCR text detection and tracking.
634+
TEXT_DETECTION = 7;
635+
636+
// Object detection and tracking.
637+
OBJECT_TRACKING = 9;
501638
}
502639

503640
// Label detection mode.

0 commit comments

Comments (0)