From 34cbcc34c6f5136d44f07e72a62a63349c402957 Mon Sep 17 00:00:00 2001 From: nnegrey Date: Mon, 18 Nov 2019 09:54:06 -0700 Subject: [PATCH 1/2] Add samples for speech diarization ga (auto-punctuation samples already on v1) --- speech/cloud-client/pom.xml | 2 +- .../example/speech/TranscribeDiarization.java | 99 +++++++++++++++++++ .../speech/TranscribeDiarizationGcs.java | 99 +++++++++++++++++++ .../speech/TranscribeDiarizationIT.java | 84 ++++++++++++++++ 4 files changed, 283 insertions(+), 1 deletion(-) create mode 100644 speech/cloud-client/src/main/java/com/example/speech/TranscribeDiarization.java create mode 100644 speech/cloud-client/src/main/java/com/example/speech/TranscribeDiarizationGcs.java create mode 100644 speech/cloud-client/src/test/java/com/example/speech/TranscribeDiarizationIT.java diff --git a/speech/cloud-client/pom.xml b/speech/cloud-client/pom.xml index f8ed723173c..bac92be3f09 100644 --- a/speech/cloud-client/pom.xml +++ b/speech/cloud-client/pom.xml @@ -40,7 +40,7 @@ com.google.cloud google-cloud-speech - 1.13.0 + 1.22.0 diff --git a/speech/cloud-client/src/main/java/com/example/speech/TranscribeDiarization.java b/speech/cloud-client/src/main/java/com/example/speech/TranscribeDiarization.java new file mode 100644 index 00000000000..5a590d132d6 --- /dev/null +++ b/speech/cloud-client/src/main/java/com/example/speech/TranscribeDiarization.java @@ -0,0 +1,99 @@ +/* + * Copyright 2019 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.example.speech; + +// [START speech_transcribe_diarization] + +import com.google.cloud.speech.v1.RecognitionAudio; +import com.google.cloud.speech.v1.RecognitionConfig; +import com.google.cloud.speech.v1.RecognizeResponse; +import com.google.cloud.speech.v1.SpeakerDiarizationConfig; +import com.google.cloud.speech.v1.SpeechClient; +import com.google.cloud.speech.v1.SpeechRecognitionAlternative; +import com.google.cloud.speech.v1.WordInfo; +import com.google.protobuf.ByteString; + +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.Paths; + +class TranscribeDiarization { + + static void transcribeDiarization() throws IOException { + // TODO(developer): Replace these variables before running the sample. + String fileName = "resources/commercial_mono.wav"; + transcribeDiarization(fileName); + } + + // Transcribe the given audio file using speaker diarization. + static void transcribeDiarization(String fileName) throws IOException { + Path path = Paths.get(fileName); + byte[] content = Files.readAllBytes(path); + + // Initialize client that will be used to send requests. This client only needs to be created + // once, and can be reused for multiple requests. After completing all of your requests, call + // the "close" method on the client to safely clean up any remaining background resources. + try (SpeechClient client = SpeechClient.create()) { + // Get the contents of the local audio file + RecognitionAudio recognitionAudio = + RecognitionAudio.newBuilder().setContent(ByteString.copyFrom(content)).build(); + SpeakerDiarizationConfig speakerDiarizationConfig = SpeakerDiarizationConfig.newBuilder() + .setEnableSpeakerDiarization(true) + .setMinSpeakerCount(2) + .setMaxSpeakerCount(2) + .build(); + // Configure request to enable Speaker diarization + RecognitionConfig config = RecognitionConfig.newBuilder() + .setEncoding(RecognitionConfig.AudioEncoding.LINEAR16) + .setLanguageCode("en-US") + .setSampleRateHertz(8000) + .setDiarizationConfig(speakerDiarizationConfig) + .build(); + + // Perform the transcription request + RecognizeResponse recognizeResponse = client.recognize(config, recognitionAudio); + + // Speaker Tags are only included in the last result object, which has only one alternative. + SpeechRecognitionAlternative alternative = + recognizeResponse.getResults( + recognizeResponse.getResultsCount() - 1).getAlternatives(0); + // The alternative is made up of WordInfo objects that contain the speaker_tag. + WordInfo wordInfo = alternative.getWords(0); + int currentSpeakerTag = wordInfo.getSpeakerTag(); + // For each word, get all the words associated with one speaker, once the speaker changes, + // add a new line with the new speaker and their spoken words. + StringBuilder speakerWords = new StringBuilder( + String.format("Speaker %d: %s", wordInfo.getSpeakerTag(), wordInfo.getWord())); + for (int i = 1; i < alternative.getWordsCount(); i++) { + wordInfo = alternative.getWords(i); + if (currentSpeakerTag == wordInfo.getSpeakerTag()) { + speakerWords.append(" "); + speakerWords.append(wordInfo.getWord()); + } else { + speakerWords.append( + String.format("\nSpeaker %d: %s", + wordInfo.getSpeakerTag(), + wordInfo.getWord())); + currentSpeakerTag = wordInfo.getSpeakerTag(); + } + } + System.out.println(speakerWords.toString()); + } + } +} +// [END speech_transcribe_diarization] diff --git a/speech/cloud-client/src/main/java/com/example/speech/TranscribeDiarizationGcs.java b/speech/cloud-client/src/main/java/com/example/speech/TranscribeDiarizationGcs.java new file mode 100644 index 00000000000..057fc803efa --- /dev/null +++ b/speech/cloud-client/src/main/java/com/example/speech/TranscribeDiarizationGcs.java @@ -0,0 +1,99 @@ +/* + * Copyright 2019 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.example.speech; + +// [START speech_transcribe_diarization_gcs] + +import com.google.api.gax.longrunning.OperationFuture; +import com.google.cloud.speech.v1.LongRunningRecognizeMetadata; +import com.google.cloud.speech.v1.LongRunningRecognizeResponse; +import com.google.cloud.speech.v1.RecognitionAudio; +import com.google.cloud.speech.v1.RecognitionConfig; +import com.google.cloud.speech.v1.SpeakerDiarizationConfig; +import com.google.cloud.speech.v1.SpeechClient; +import com.google.cloud.speech.v1.SpeechRecognitionAlternative; +import com.google.cloud.speech.v1.WordInfo; + +import java.io.IOException; +import java.util.concurrent.ExecutionException; + +public class TranscribeDiarizationGcs { + + static void transcribeDiarizationGcs() throws IOException, ExecutionException, InterruptedException { + // TODO(developer): Replace these variables before running the sample. + String gcsUri = "gs://cloud-samples-data/speech/commercial_mono.wav"; + transcribeDiarizationGcs(gcsUri); + } + + // Transcribe the give gcs file using speaker diarization + public static void transcribeDiarizationGcs(String gcsUri) throws IOException, ExecutionException, InterruptedException { + // Initialize client that will be used to send requests. This client only needs to be created + // once, and can be reused for multiple requests. After completing all of your requests, call + // the "close" method on the client to safely clean up any remaining background resources. + try (SpeechClient speechClient = SpeechClient.create()) { + SpeakerDiarizationConfig speakerDiarizationConfig = SpeakerDiarizationConfig.newBuilder() + .setEnableSpeakerDiarization(true) + .setMinSpeakerCount(2) + .setMaxSpeakerCount(2) + .build(); + // Configure request to enable Speaker diarization + RecognitionConfig config = + RecognitionConfig.newBuilder() + .setEncoding(RecognitionConfig.AudioEncoding.LINEAR16) + .setLanguageCode("en-US") + .setSampleRateHertz(8000) + .setDiarizationConfig(speakerDiarizationConfig) + .build(); + // Set the remote path for the audio file + RecognitionAudio audio = RecognitionAudio.newBuilder().setUri(gcsUri).build(); + + // Use non-blocking call for getting file transcription + OperationFuture future = + speechClient.longRunningRecognizeAsync(config, audio); + System.out.println("Waiting for response..."); + + // Speaker Tags are only included in the last result object, which has only one alternative. + LongRunningRecognizeResponse response = future.get(); + SpeechRecognitionAlternative alternative = + response.getResults( + response.getResultsCount() - 1) + .getAlternatives(0); + // The alternative is made up of WordInfo objects that contain the speaker_tag. + WordInfo wordInfo = alternative.getWords(0); + int currentSpeakerTag = wordInfo.getSpeakerTag(); + // For each word, get all the words associated with one speaker, once the speaker changes, + // add a new line with the new speaker and their spoken words. + StringBuilder speakerWords = new StringBuilder( + String.format("Speaker %d: %s", wordInfo.getSpeakerTag(), wordInfo.getWord())); + for (int i = 1; i < alternative.getWordsCount(); i++) { + wordInfo = alternative.getWords(i); + if (currentSpeakerTag == wordInfo.getSpeakerTag()) { + speakerWords.append(" "); + speakerWords.append(wordInfo.getWord()); + } else { + speakerWords.append( + String.format("\nSpeaker %d: %s", + wordInfo.getSpeakerTag(), + wordInfo.getWord())); + currentSpeakerTag = wordInfo.getSpeakerTag(); + } + } + System.out.println(speakerWords.toString()); + } + } +} +// [END speech_transcribe_diarization_gcs] diff --git a/speech/cloud-client/src/test/java/com/example/speech/TranscribeDiarizationIT.java b/speech/cloud-client/src/test/java/com/example/speech/TranscribeDiarizationIT.java new file mode 100644 index 00000000000..15ef8fd889f --- /dev/null +++ b/speech/cloud-client/src/test/java/com/example/speech/TranscribeDiarizationIT.java @@ -0,0 +1,84 @@ +/* + * Copyright 2018 Google Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.example.speech; + +import static com.google.common.truth.Truth.assertThat; +import static junit.framework.TestCase.assertNotNull; + +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.io.PrintStream; +import java.util.concurrent.ExecutionException; + +import org.junit.After; +import org.junit.Before; +import org.junit.BeforeClass; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.JUnit4; + +// Tests for speech Transcribe Diarization samples. +@RunWith(JUnit4.class) +@SuppressWarnings("checkstyle:abbreviationaswordinname") +public class TranscribeDiarizationIT { + private ByteArrayOutputStream bout; + private PrintStream out; + + // The path to the audio file to transcribe + private String recognitionAudioFile = "./resources/commercial_mono.wav"; + + private static void requireEnvVar(String varName) { + assertNotNull( + System.getenv(varName), + "Environment variable '%s' is required to perform these tests.".format(varName) + ); + } + + @BeforeClass + public static void checkRequirements() { + requireEnvVar("GOOGLE_APPLICATION_CREDENTIALS"); + } + + @Before + public void setUp() { + bout = new ByteArrayOutputStream(); + out = new PrintStream(bout); + System.setOut(out); + } + + @After + public void tearDown() { + System.setOut(null); + } + + @Test + public void testDiarization() throws IOException { + TranscribeDiarization.transcribeDiarization(recognitionAudioFile); + String got = bout.toString(); + assertThat(got).contains("Speaker 1: I'm here"); + assertThat(got).contains("Speaker 2: Hi, I'd like to buy a"); + } + + @Test + public void testDiarizationGcs() throws IOException, ExecutionException, InterruptedException { + TranscribeDiarizationGcs.transcribeDiarizationGcs( + "gs://cloud-samples-data/speech/commercial_mono.wav"); + String got = bout.toString(); + assertThat(got).contains("Speaker 1: I'm here"); + assertThat(got).contains("Speaker 2: Hi, I'd like to buy a"); + } +} From 5df92dd4956f799f8bdbda6a05708c796762afae Mon Sep 17 00:00:00 2001 From: nnegrey Date: Mon, 18 Nov 2019 10:06:45 -0700 Subject: [PATCH 2/2] Lint fix --- .../java/com/example/speech/TranscribeDiarizationGcs.java | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/speech/cloud-client/src/main/java/com/example/speech/TranscribeDiarizationGcs.java b/speech/cloud-client/src/main/java/com/example/speech/TranscribeDiarizationGcs.java index 057fc803efa..de55cc44ea6 100644 --- a/speech/cloud-client/src/main/java/com/example/speech/TranscribeDiarizationGcs.java +++ b/speech/cloud-client/src/main/java/com/example/speech/TranscribeDiarizationGcs.java @@ -33,14 +33,16 @@ public class TranscribeDiarizationGcs { - static void transcribeDiarizationGcs() throws IOException, ExecutionException, InterruptedException { + static void transcribeDiarizationGcs() throws IOException, ExecutionException, + InterruptedException { // TODO(developer): Replace these variables before running the sample. String gcsUri = "gs://cloud-samples-data/speech/commercial_mono.wav"; transcribeDiarizationGcs(gcsUri); } // Transcribe the give gcs file using speaker diarization - public static void transcribeDiarizationGcs(String gcsUri) throws IOException, ExecutionException, InterruptedException { + public static void transcribeDiarizationGcs(String gcsUri) throws IOException, + ExecutionException, InterruptedException { // Initialize client that will be used to send requests. This client only needs to be created // once, and can be reused for multiple requests. After completing all of your requests, call // the "close" method on the client to safely clean up any remaining background resources.