Add additional filter graph option to StreamWriter (#3194)

mthrok · facebook-github-bot · commit 3155d0e76bf9 · 2023-03-27T11:03:12.000-07:00
Summary: Pull Request resolved: #3194 Differential Revision: D44283910 Pulled By: mthrok fbshipit-source-id: 9f73bcabdeb01b88220d47809ef860e854878c85
diff --git a/test/torchaudio_unittest/io/stream_writer_test.py b/test/torchaudio_unittest/io/stream_writer_test.py
@@ -580,3 +580,57 @@ def write_audio(buffer, bit_rate):
         out1_size = dst.tell()
 
         self.assertGreater(out1_size, out0_size)
+
+    def test_filter_graph_audio(self):
+        """Can apply additional effect with filter graph"""
+        sample_rate = 8000
+        num_channels = 2
+        ext = "wav"
+        filename = f"test.{ext}"
+
+        original = get_audio_chunk("s16", num_channels=num_channels, sample_rate=sample_rate)
+
+        dst = self.get_dst(filename)
+        w = StreamWriter(dst, format=ext)
+        w.add_audio_stream(sample_rate=8000, num_channels=num_channels, filter_desc="areverse", format="s16")
+
+        with w.open():
+            w.write_audio_chunk(0, original)
+
+        # check
+        if self.test_fileobj:
+            dst.flush()
+
+        reader = torchaudio.io.StreamReader(src=self.get_temp_path(filename))
+        reader.add_audio_stream(-1)
+        reader.process_all_packets()
+        (output,) = reader.pop_chunks()
+
+        self.assertEqual(output, original.flip(0))
+
+    def test_filter_graph_video(self):
+        """Can apply additional effect with filter graph"""
+        rate = 30
+        num_frames, width, height = 400, 160, 90
+        ext = "mp4"
+        filename = f"test.{ext}"
+
+        original = torch.zeros((num_frames, 3, height, width), dtype=torch.uint8)
+
+        dst = self.get_dst(filename)
+        w = StreamWriter(dst, format=ext)
+        w.add_video_stream(frame_rate=rate, format="rgb24", height=height, width=width, filter_desc="framestep=2")
+
+        with w.open():
+            w.write_video_chunk(0, original)
+
+        # check
+        if self.test_fileobj:
+            dst.flush()
+
+        reader = torchaudio.io.StreamReader(src=self.get_temp_path(filename))
+        reader.add_video_stream(-1)
+        reader.process_all_packets()
+        (output,) = reader.pop_chunks()
+
+        self.assertEqual(output.shape, [num_frames // 2, 3, height, width])
diff --git a/torchaudio/csrc/ffmpeg/stream_writer/encode_process.cpp b/torchaudio/csrc/ffmpeg/stream_writer/encode_process.cpp
@@ -513,19 +513,26 @@ FilterGraph get_audio_filter_graph(
     AVSampleFormat src_fmt,
     int sample_rate,
     uint64_t channel_layout,
+    const c10::optional<std::string>& filter_desc,
     AVSampleFormat enc_fmt,
     int nb_samples) {
-  const std::string filter_desc = [&]() -> const std::string {
+  const std::string desc = [&]() -> const std::string {
     if (src_fmt == enc_fmt) {
       if (nb_samples == 0) {
-        return "anull";
+        return filter_desc.value_or("anull");
       } else {
         std::stringstream ss;
+        if (filter_desc) {
+          ss << filter_desc.value() << ",";
+        }
         ss << "asetnsamples=n=" << nb_samples << ":p=0";
         return ss.str();
       }
     } else {
       std::stringstream ss;
+      if (filter_desc) {
+        ss << filter_desc.value() << ",";
+      }
       ss << "aformat=" << av_get_sample_fmt_name(enc_fmt);
       if (nb_samples > 0) {
         ss << ",asetnsamples=n=" << nb_samples << ":p=0";
@@ -537,7 +544,7 @@ FilterGraph get_audio_filter_graph(
   FilterGraph f{AVMEDIA_TYPE_AUDIO};
   f.add_audio_src(src_fmt, {1, sample_rate}, sample_rate, channel_layout);
   f.add_sink();
-  f.add_process(filter_desc);
+  f.add_process(desc);
   f.create_filter();
   return f;
 }
@@ -547,13 +554,17 @@ FilterGraph get_video_filter_graph(
     AVRational rate,
     int width,
     int height,
+    const c10::optional<std::string>& filter_desc,
     AVPixelFormat enc_fmt,
     bool is_cuda) {
   auto desc = [&]() -> std::string {
     if (src_fmt == enc_fmt || is_cuda) {
-      return "null";
+      return filter_desc.value_or("null");
     } else {
       std::stringstream ss;
+      if (filter_desc) {
+        ss << filter_desc.value() << ",";
+      }
       ss << "format=" << av_get_pix_fmt_name(enc_fmt);
       return ss.str();
     }
@@ -624,7 +635,8 @@ EncodeProcess get_audio_encode_process(
     const c10::optional<std::string>& encoder,
     const c10::optional<OptionDict>& encoder_option,
     const c10::optional<std::string>& encoder_format,
-    const c10::optional<CodecConfig>& codec_config) {
+    const c10::optional<CodecConfig>& codec_config,
+    const c10::optional<std::string>& filter_desc) {
   // 1. Check the source format, rate and channels
   const AVSampleFormat src_fmt = get_sample_fmt(format);
   TORCH_CHECK(
@@ -663,7 +675,12 @@ EncodeProcess get_audio_encode_process(
 
   // 5. Build filter graph
   FilterGraph filter_graph = get_audio_filter_graph(
-      src_fmt, src_sample_rate, channel_layout, enc_fmt, codec_ctx->frame_size);
+      src_fmt,
+      src_sample_rate,
+      channel_layout,
+      filter_desc,
+      enc_fmt,
+      codec_ctx->frame_size);
 
   // 6. Instantiate source frame
   AVFramePtr src_frame = get_audio_frame(
@@ -701,7 +718,8 @@ EncodeProcess get_video_encode_process(
     const c10::optional<OptionDict>& encoder_option,
     const c10::optional<std::string>& encoder_format,
     const c10::optional<std::string>& hw_accel,
-    const c10::optional<CodecConfig>& codec_config) {
+    const c10::optional<CodecConfig>& codec_config,
+    const c10::optional<std::string>& filter_desc) {
   // 1. Checkc the source format, rate and resolution
   const AVPixelFormat src_fmt = get_pix_fmt(format);
   AVRational src_rate = av_d2q(frame_rate, 1 << 24);
@@ -742,7 +760,13 @@ EncodeProcess get_video_encode_process(
 
   // 5. Build filter graph
   FilterGraph filter_graph = get_video_filter_graph(
-      src_fmt, src_rate, src_width, src_height, enc_fmt, hw_accel.has_value());
+      src_fmt,
+      src_rate,
+      src_width,
+      src_height,
+      filter_desc,
+      enc_fmt,
+      hw_accel.has_value());
 
   // 6. Instantiate source frame
   AVFramePtr src_frame = [&]() {
diff --git a/torchaudio/csrc/ffmpeg/stream_writer/encode_process.h b/torchaudio/csrc/ffmpeg/stream_writer/encode_process.h
@@ -41,7 +41,8 @@ EncodeProcess get_audio_encode_process(
     const c10::optional<std::string>& encoder,
     const c10::optional<OptionDict>& encoder_option,
     const c10::optional<std::string>& encoder_format,
-    const c10::optional<CodecConfig>& codec_config);
+    const c10::optional<CodecConfig>& codec_config,
+    const c10::optional<std::string>& filter_desc);
 
 EncodeProcess get_video_encode_process(
     AVFormatContext* format_ctx,
@@ -53,6 +54,7 @@ EncodeProcess get_video_encode_process(
     const c10::optional<OptionDict>& encoder_option,
     const c10::optional<std::string>& encoder_format,
     const c10::optional<std::string>& hw_accel,
-    const c10::optional<CodecConfig>& codec_config);
+    const c10::optional<CodecConfig>& codec_config,
+    const c10::optional<std::string>& filter_desc);
 
 }; // namespace torchaudio::io
diff --git a/torchaudio/csrc/ffmpeg/stream_writer/stream_writer.cpp b/torchaudio/csrc/ffmpeg/stream_writer/stream_writer.cpp
@@ -60,7 +60,8 @@ void StreamWriter::add_audio_stream(
     const c10::optional<std::string>& encoder,
     const c10::optional<OptionDict>& encoder_option,
     const c10::optional<std::string>& encoder_format,
-    const c10::optional<CodecConfig>& codec_config) {
+    const c10::optional<CodecConfig>& codec_config,
+    const c10::optional<std::string>& filter_desc) {
   TORCH_CHECK(!is_open, "Output is already opened. Cannot add a new stream.");
   TORCH_INTERNAL_ASSERT(
       pFormatContext->nb_streams == processes.size(),
@@ -73,7 +74,8 @@ void StreamWriter::add_audio_stream(
       encoder,
       encoder_option,
       encoder_format,
-      codec_config));
+      codec_config,
+      filter_desc));
 }
 
 void StreamWriter::add_video_stream(
@@ -85,7 +87,8 @@ void StreamWriter::add_video_stream(
     const c10::optional<OptionDict>& encoder_option,
     const c10::optional<std::string>& encoder_format,
     const c10::optional<std::string>& hw_accel,
-    const c10::optional<CodecConfig>& codec_config) {
+    const c10::optional<CodecConfig>& codec_config,
+    const c10::optional<std::string>& filter_desc) {
   TORCH_CHECK(!is_open, "Output is already opened. Cannot add a new stream.");
   TORCH_INTERNAL_ASSERT(
       pFormatContext->nb_streams == processes.size(),
@@ -100,7 +103,8 @@ void StreamWriter::add_video_stream(
       encoder_option,
       encoder_format,
       hw_accel,
-      codec_config));
+      codec_config,
+      filter_desc));
 }
 
 void StreamWriter::set_metadata(const OptionDict& metadata) {
diff --git a/torchaudio/csrc/ffmpeg/stream_writer/stream_writer.h b/torchaudio/csrc/ffmpeg/stream_writer/stream_writer.h
@@ -100,14 +100,17 @@ class StreamWriter {
   ///  To list supported formats for the encoder, you can use
   /// ``ffmpeg -h encoder=<ENCODER>`` command.
   /// @param codec_config Codec configuration.
+  /// @param filter_desc Additional processing to apply before
+  /// encoding the input data
   void add_audio_stream(
       int sample_rate,
       int num_channels,
       const std::string& format,
       const c10::optional<std::string>& encoder,
       const c10::optional<OptionDict>& encoder_option,
       const c10::optional<std::string>& encoder_format,
-      const c10::optional<CodecConfig>& codec_config);
+      const c10::optional<CodecConfig>& codec_config,
+      const c10::optional<std::string>& filter_desc);
 
   /// Add an output video stream.
   ///
@@ -139,6 +142,8 @@ class StreamWriter {
   ///
   /// If `None`, the video chunk Tensor has to be a CPU Tensor.
   /// @endparblock
+  /// @param filter_desc Additional processing to apply before
+  /// encoding the input data
   void add_video_stream(
       double frame_rate,
       int width,
@@ -148,7 +153,8 @@ class StreamWriter {
       const c10::optional<OptionDict>& encoder_option,
       const c10::optional<std::string>& encoder_format,
       const c10::optional<std::string>& hw_accel,
-      const c10::optional<CodecConfig>& codec_config);
+      const c10::optional<CodecConfig>& codec_config,
+      const c10::optional<std::string>& filter_desc);
   /// Set file-level metadata
   /// @param metadata metadata.
   void set_metadata(const OptionDict& metadata);
diff --git a/torchaudio/csrc/ffmpeg/stream_writer/tensor_converter.cpp b/torchaudio/csrc/ffmpeg/stream_writer/tensor_converter.cpp
@@ -144,9 +144,12 @@ void write_interlaced_video(
   TORCH_INTERNAL_ASSERT_DEBUG_ONLY(frame.size(2) == buffer->width);
   TORCH_INTERNAL_ASSERT_DEBUG_ONLY(frame.size(3) == num_channels);
 
-  // TODO: writable
   // https://ffmpeg.org/doxygen/4.1/muxing_8c_source.html#l00472
-  TORCH_INTERNAL_ASSERT(av_frame_is_writable(buffer), "frame is not writable.");
+  if (!av_frame_is_writable(buffer)) {
+    int ret = av_frame_make_writable(buffer);
+    TORCH_INTERNAL_ASSERT(
+        ret >= 0, "Failed to make frame writable: ", av_err2string(ret));
+  }
 
   size_t stride = buffer->width * num_channels;
   uint8_t* src = frame.data_ptr<uint8_t>();
@@ -191,9 +194,12 @@ void write_planar_video(
   TORCH_INTERNAL_ASSERT_DEBUG_ONLY(frame.size(2), buffer->height);
   TORCH_INTERNAL_ASSERT_DEBUG_ONLY(frame.size(3), buffer->width);
 
-  // TODO: writable
   // https://ffmpeg.org/doxygen/4.1/muxing_8c_source.html#l00472
-  TORCH_INTERNAL_ASSERT(av_frame_is_writable(buffer), "frame is not writable.");
+  if (!av_frame_is_writable(buffer)) {
+    int ret = av_frame_make_writable(buffer);
+    TORCH_INTERNAL_ASSERT(
+        ret >= 0, "Failed to make frame writable: ", av_err2string(ret));
+  }
 
   for (int j = 0; j < num_colors; ++j) {
     uint8_t* src = frame.index({0, j}).data_ptr<uint8_t>();
diff --git a/torchaudio/io/_stream_writer.py b/torchaudio/io/_stream_writer.py
@@ -53,11 +53,15 @@ def decorator(obj):
                 Default: ``None``."""
 
 
+_filter_desc = """Additional processing to apply before encoding the input media.
+                """
+
 _format_common_args = _format_doc(
     encoder=_encoder,
     encoder_option=_encoder_option,
     encoder_format=_encoder_format,
     codec_config=_codec_config,
+    filter_desc=_filter_desc,
 )
 
 
@@ -159,6 +163,7 @@ def add_audio_stream(
         encoder_option: Optional[Dict[str, str]] = None,
         encoder_format: Optional[str] = None,
         codec_config: Optional[CodecConfig] = None,
+        filter_desc: Optional[str] = None,
     ):
         """Add an output audio stream.
 
@@ -186,9 +191,11 @@ def add_audio_stream(
             encoder_format (str or None, optional): {encoder_format}
 
             codec_config (CodecConfig or None, optional): {codec_config}
+
+            filter_desc (str or None, optional): {filter_desc}
         """
         self._s.add_audio_stream(
-            sample_rate, num_channels, format, encoder, encoder_option, encoder_format, codec_config
+            sample_rate, num_channels, format, encoder, encoder_option, encoder_format, codec_config, filter_desc
         )
 
     @_format_common_args
@@ -203,6 +210,7 @@ def add_video_stream(
         encoder_format: Optional[str] = None,
         hw_accel: Optional[str] = None,
         codec_config: Optional[CodecConfig] = None,
+        filter_desc: Optional[str] = None,
     ):
         """Add an output video stream.
 
@@ -245,9 +253,20 @@ def add_video_stream(
                 Default: ``None``.
 
             codec_config (CodecConfig or None, optional): {codec_config}
+
+            filter_desc (str or None, optional): {filter_desc}
         """
         self._s.add_video_stream(
-            frame_rate, width, height, format, encoder, encoder_option, encoder_format, hw_accel, codec_config
+            frame_rate,
+            width,
+            height,
+            format,
+            encoder,
+            encoder_option,
+            encoder_format,
+            hw_accel,
+            codec_config,
+            filter_desc,
         )
 
     def set_metadata(self, metadata: Dict[str, str]):