Improve AudioWaveformer to use RMS (Root Mean Square) and to return both the average (RMS) values and the max sample values for graphing. Also add some new unit tests.

jonoomph · jonoomph · commit c838c126ad73 · 2022-11-01T15:17:03.000-05:00
diff --git a/bindings/python/openshot.i b/bindings/python/openshot.i
@@ -50,6 +50,7 @@
 %template() std::pair<float, float>;
 %template() std::pair<std::string, std::string>;
 %template() std::vector<std::pair<std::string, std::string>>;
+%template() std::vector<std::vector<float>>;
 
 %{
 #include "OpenShotVersion.h"
diff --git a/bindings/ruby/openshot.i b/bindings/ruby/openshot.i
@@ -50,6 +50,7 @@
 %template() std::pair<float, float>;
 %template() std::pair<std::string, std::string>;
 %template() std::vector<std::pair<std::string, std::string>>;
+%template() std::vector<std::vector<float>>;
 
 %{
 /* Ruby and FFmpeg define competing RSHIFT macros,
diff --git a/src/AudioWaveformer.cpp b/src/AudioWaveformer.cpp
@@ -30,8 +30,8 @@ AudioWaveformer::~AudioWaveformer()
 }
 
 // Extract audio samples from any ReaderBase class
-std::vector<float> AudioWaveformer::ExtractSamples(int channel, int num_per_second, bool normalize) {
-    std::vector<float> extracted_data(0);
+AudioWaveformData AudioWaveformer::ExtractSamples(int channel, int num_per_second, bool normalize) {
+    AudioWaveformData data;
 
     if (reader) {
         // Open reader (if needed)
@@ -45,60 +45,73 @@ std::vector<float> AudioWaveformer::ExtractSamples(int channel, int num_per_seco
         int sample_rate = reader->info.sample_rate;
         int sample_divisor = sample_rate / num_per_second;
         int total_samples = num_per_second * (reader->info.duration + 1.0);
-
-        // Size audio buffer (for smaller dataset)
-        extracted_data.resize(total_samples);
         int extracted_index = 0;
 
-        // Clear audio buffer
-        for (auto s = 0; s < total_samples; s++) {
-            extracted_data[s] = 0.0;
-        }
+        // Resize and clear audio buffers
+        data.resize(total_samples);
+        data.zero(total_samples);
 
         // Loop through all frames
         int sample_index = 0;
-        float samples_total = 0.0;
         float samples_max = 0.0;
-        float samples_min = 0.0;
+        float chunk_max = 0.0;
+        float chunk_squared_sum = 0.0;
+
+        // How many channels are we using
+        int channel_count = 1;
+        if (channel == -1) {
+            channel_count = reader->info.channels;
+        }
 
         for (auto f = 1; f <= reader->info.video_length; f++) {
             // Get next frame
             shared_ptr<openshot::Frame> frame = reader->GetFrame(f);
 
-            float* samples = frame->GetAudioSamples(channel);
+            // Get sample value from a specific channel (or all channels)
             for (auto s = 0; s < frame->GetAudioSamplesCount(); s++) {
-                samples_total += samples[s];
+
+                for (auto channel_index = 0; channel_index < reader->info.channels; channel_index++) {
+                    if (channel == channel_index || channel == -1) {
+                        float *samples = frame->GetAudioSamples(channel_index);
+                        float rms_sample_value = std::sqrt(samples[s] * samples[s]);
+
+                        // Accumulate sample averages
+                        chunk_squared_sum += rms_sample_value;
+                        chunk_max = std::max(chunk_max, rms_sample_value);
+                    }
+                }
+
                 sample_index += 1;
 
                 // Cut-off reached
                 if (sample_index % sample_divisor == 0) {
-                    float avg_sample_value = samples_total / sample_divisor;
-                    extracted_data[extracted_index] = avg_sample_value;
+                    float avg_squared_sum = chunk_squared_sum / (sample_divisor * channel_count);
+                    data.max_samples[extracted_index] = chunk_max;
+                    data.rms_samples[extracted_index] = avg_squared_sum;
                     extracted_index++;
 
                     // Track max/min values
-                    samples_max = std::max(samples_max, avg_sample_value);
-                    samples_min = std::min(samples_min, avg_sample_value);
+                    samples_max = std::max(samples_max, chunk_max);
 
                     // reset sample total and index
                     sample_index = 0;
-                    samples_total = 0.0;
+                    chunk_max = 0.0;
+                    chunk_squared_sum = 0.0;
                 }
             }
         }
 
         // Scale all values to the -1 to +1 range (regardless of how small or how large the
         // original audio sample values are)
         if (normalize) {
-            float scale = std::min(1.0f / samples_max, 1.0f / std::fabs(samples_min));
-            for (auto s = 0; s < total_samples; s++) {
-                extracted_data[s] *= scale;
-            }
+            float scale = 1.0f / samples_max;
+            data.scale(total_samples, scale);
         }
 
         // Resume previous has_video value
         reader->info.has_video = does_reader_have_video;
     }
 
-    return extracted_data;
+
+    return data;
 }
diff --git a/src/AudioWaveformer.h b/src/AudioWaveformer.h
@@ -20,31 +20,81 @@
 
 namespace openshot {
 
-	/**
-	 * @brief This class is used to extra audio data used for generating waveforms.
-	 *
-	 * Pass in a ReaderBase* with audio data, and this class will iterate the reader,
-	 * and sample down the dataset to a much smaller set - more useful for generating
-	 * waveforms. For example, take 44100 samples per second, and reduce it to 20
-	 * "average" samples per second - much easier to graph.
-	 */
-	class AudioWaveformer {
-	private:
+    /**
+     * @brief This struct holds the extracted waveform data (both the RMS root-mean-squared average, and the max values)
+     *
+     * Because we extract 2 different datasets from the audio, we return this struct with access to both sets of data,
+     * the average root mean squared values, and the max sample values.
+     */
+    struct AudioWaveformData
+    {
+        std::vector<float> max_samples;  ///< Max sample values
+        std::vector<float> rms_samples;  ///< Average (RMS) sample values
+
+        /// Resize both datasets (a negative count is treated as 0)
+        void resize(int total_samples) {
+            max_samples.resize(total_samples > 0 ? total_samples : 0);
+            rms_samples.resize(total_samples > 0 ? total_samples : 0);
+        }
+
+        /// Zero out # of values in both datasets (never writes past the end of either dataset)
+        void zero(int total_samples) {
+            for (int s = 0, n = safe_count(max_samples, total_samples); s < n; s++) max_samples[s] = 0.0;
+            for (int s = 0, n = safe_count(rms_samples, total_samples); s < n; s++) rms_samples[s] = 0.0;
+        }
+
+        /// Scale # of values by some factor (never writes past the end of either dataset)
+        void scale(int total_samples, float factor) {
+            for (int s = 0, n = safe_count(max_samples, total_samples); s < n; s++) max_samples[s] *= factor;
+            for (int s = 0, n = safe_count(rms_samples, total_samples); s < n; s++) rms_samples[s] *= factor;
+        }
+
+        /// Clear and free memory of both datasets
+        void clear() {
+            max_samples.clear();
+            max_samples.shrink_to_fit();
+            rms_samples.clear();
+            rms_samples.shrink_to_fit();
+        }
+
+        /// Return a vector of vectors: copies of max_samples (index 0) and rms_samples (index 1)
+        std::vector<std::vector<float>> vectors() const {
+            return {max_samples, rms_samples};
+        }
+
+    private:
+        /// Clamp a requested value count to the range [0, values.size()]
+        static int safe_count(const std::vector<float>& values, int requested) {
+            const int available = static_cast<int>(values.size());
+            return requested < 0 ? 0 : (requested < available ? requested : available);
+        }
+    };
+
+    /**
+     * @brief This class is used to extract audio data used for generating waveforms.
+     *
+     * Pass in a ReaderBase* with audio data, and this class will iterate the reader,
+     * and sample down the dataset to a much smaller set - more useful for generating
+     * waveforms. For example, take 44100 samples per second, and reduce it to 20
+     * "max" or "average" samples per second - much easier to graph.
+     */
+    class AudioWaveformer {
+    private:
         ReaderBase* reader;
 
-	public:
-		/// Default constructor
+    public:
+        /// Constructor (takes the ReaderBase* to extract audio data from)
         AudioWaveformer(ReaderBase* reader);
 
         /// @brief Extract audio samples from any ReaderBase class
-        /// @param channel Which audio channel should we extract data from
+        /// @param channel Which audio channel should we extract data from (-1 == all channels)
         /// @param num_per_second How many samples per second to return
         /// @param normalize Should we scale the data range so the largest value is 1.0
-		std::vector<float> ExtractSamples(int channel, int num_per_second, bool normalize);
+        AudioWaveformData ExtractSamples(int channel, int num_per_second, bool normalize);
 
-		/// Destructor
-		~AudioWaveformer();
-	};
+        /// Destructor
+        ~AudioWaveformer();
+    };
 
 }
 
diff --git a/tests/AudioWaveformer.cpp b/tests/AudioWaveformer.cpp
@@ -6,7 +6,7 @@
  * @ref License
  */
 
-// Copyright (c) 2008-2019 OpenShot Studios, LLC
+// Copyright (c) 2008-2022 OpenShot Studios, LLC
 //
 // SPDX-License-Identifier: LGPL-3.0-or-later
 
@@ -28,22 +28,18 @@ TEST_CASE( "Extract waveform data piano.wav", "[libopenshot][audiowaveformer]" )
     // Create AudioWaveformer and extract a smaller "average" sample set of audio data
     AudioWaveformer waveformer(&r);
     for (auto channel = 0; channel < r.info.channels; channel++) {
-        std::vector<float> waveform = waveformer.ExtractSamples(channel, 20, false);
+        AudioWaveformData waveform = waveformer.ExtractSamples(channel, 20, false);
 
         if (channel == 0) {
-            CHECK(waveform.size() == 107);
-            CHECK(waveform[0] == Approx(0.000820312474f).margin(0.00001));
-            CHECK(waveform[86] == Approx(-0.00144531252f).margin(0.00001));
-            CHECK(waveform[87] == Approx(0.0f).margin(0.00001));
-
-            for (auto sample = 0; sample < waveform.size(); sample++) {
-                std::cout << waveform[sample] << std::endl;
-            }
+            CHECK(waveform.rms_samples.size() == 107);
+            CHECK(waveform.rms_samples[0] == Approx(0.04879f).margin(0.00001));
+            CHECK(waveform.rms_samples[86] == Approx(0.13578f).margin(0.00001));
+            CHECK(waveform.rms_samples[87] == Approx(0.0f).margin(0.00001));
         } else if (channel == 1) {
-            CHECK(waveform.size() == 107);
-            CHECK(waveform[0] == Approx(0.000820312474f).margin(0.00001));
-            CHECK(waveform[86] == Approx(-0.00144531252f).margin(0.00001));
-            CHECK(waveform[87] == Approx(0.0f).margin(0.00001));
+            CHECK(waveform.rms_samples.size() == 107);
+            CHECK(waveform.rms_samples[0] == Approx(0.04879f).margin(0.00001));
+            CHECK(waveform.rms_samples[86] == Approx(0.13578f).margin(0.00001));
+            CHECK(waveform.rms_samples[87] == Approx(0.0f).margin(0.00001));
         }
 
         waveform.clear();
@@ -63,18 +59,18 @@ TEST_CASE( "Extract waveform data sintel", "[libopenshot][audiowaveformer]" )
     // Create AudioWaveformer and extract a smaller "average" sample set of audio data
     AudioWaveformer waveformer(&r);
     for (auto channel = 0; channel < r.info.channels; channel++) {
-        std::vector<float> waveform = waveformer.ExtractSamples(channel, 20, false);
+        AudioWaveformData waveform = waveformer.ExtractSamples(channel, 20, false);
 
         if (channel == 0) {
-            CHECK(waveform.size() == 1058);
-            CHECK(waveform[0] == Approx(-1.48391728e-05f).margin(0.00001));
-            CHECK(waveform[1037] == Approx(6.79016102e-06f).margin(0.00001));
-            CHECK(waveform[1038] == Approx(0.0f).margin(0.00001));
+            CHECK(waveform.rms_samples.size() == 1058);
+            CHECK(waveform.rms_samples[0] == Approx(0.00001f).margin(0.00001));
+            CHECK(waveform.rms_samples[1037] == Approx(0.00003f).margin(0.00001));
+            CHECK(waveform.rms_samples[1038] == Approx(0.0f).margin(0.00001));
         } else if (channel == 1) {
-            CHECK(waveform.size() == 1058);
-            CHECK(waveform[0] == Approx(-1.43432617e-05f).margin(0.00001));
-            CHECK(waveform[1037] == Approx(6.79016102e-06f).margin(0.00001));
-            CHECK(waveform[1038] == Approx(0.0f).margin(0.00001));
+            CHECK(waveform.rms_samples.size() == 1058);
+            CHECK(waveform.rms_samples[0] == Approx(0.00001f ).margin(0.00001));
+            CHECK(waveform.rms_samples[1037] == Approx(0.00003f).margin(0.00001));
+            CHECK(waveform.rms_samples[1038] == Approx(0.0f).margin(0.00001));
         }
 
         waveform.clear();
@@ -84,6 +80,29 @@ TEST_CASE( "Extract waveform data sintel", "[libopenshot][audiowaveformer]" )
     r.Close();
 }
 
+
+TEST_CASE( "Extract waveform data sintel (all channels)", "[libopenshot][audiowaveformer]" )
+{
+    // Point an FFmpegReader at the sintel trailer test file
+    std::stringstream media_path;
+    media_path << TEST_MEDIA_PATH << "sintel_trailer-720p.mp4";
+    FFmpegReader reader(media_path.str());
+
+    // Extract 20 values per second with channel == -1 (all channels), not normalized
+    AudioWaveformer waveformer(&reader);
+    AudioWaveformData extracted = waveformer.ExtractSamples(-1, 20, false);
+    const auto& rms = extracted.rms_samples;
+
+    CHECK(rms.size() == 1058);
+    CHECK(rms[0] == Approx(0.00001f).margin(0.00001));
+    CHECK(rms[1037] == Approx(0.00003f).margin(0.00001));
+    CHECK(rms[1038] == Approx(0.0f).margin(0.00001));
+
+    // Release the extracted data, then close the reader
+    extracted.clear();
+    reader.Close();
+}
+
 TEST_CASE( "Normalize & scale waveform data piano.wav", "[libopenshot][audiowaveformer]" )
 {
     // Create a reader
@@ -95,14 +114,14 @@ TEST_CASE( "Normalize & scale waveform data piano.wav", "[libopenshot][audiowave
     AudioWaveformer waveformer(&r);
     for (auto channel = 0; channel < r.info.channels; channel++) {
         // Normalize values and scale them between -1 and +1
-        std::vector<float> waveform = waveformer.ExtractSamples(channel, 20, true);
+        AudioWaveformData waveform = waveformer.ExtractSamples(channel, 20, true);
 
         if (channel == 0) {
-            CHECK(waveform.size() == 107);
-            CHECK(waveform[0] == Approx(0.113821134).margin(0.00001));
-            CHECK(waveform[35] == Approx(-1.0f).margin(0.00001));
-            CHECK(waveform[86] == Approx(-0.200542003f).margin(0.00001));
-            CHECK(waveform[87] == Approx(0.0f).margin(0.00001));
+            CHECK(waveform.rms_samples.size() == 107);
+            CHECK(waveform.rms_samples[0] == Approx(0.07524f).margin(0.00001));
+            CHECK(waveform.rms_samples[35] == Approx(0.20063f).margin(0.00001));
+            CHECK(waveform.rms_samples[86] == Approx(0.2094f).margin(0.00001));
+            CHECK(waveform.rms_samples[87] == Approx(0.0f).margin(0.00001));
         }
 
         waveform.clear();