Skip to content

Commit c838c12

Browse files
committed
Improvements to AudioWaveformer to use RMS (Root Mean Square), and return both average RMS and max RMS for graphing, including some new unit tests.
1 parent 9cd7dd6 commit c838c12

File tree

5 files changed

+153
-69
lines changed

5 files changed

+153
-69
lines changed

bindings/python/openshot.i

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,7 @@
5050
%template() std::pair<float, float>;
5151
%template() std::pair<std::string, std::string>;
5252
%template() std::vector<std::pair<std::string, std::string>>;
53+
%template() std::vector<std::vector<float>>;
5354

5455
%{
5556
#include "OpenShotVersion.h"

bindings/ruby/openshot.i

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,7 @@
5050
%template() std::pair<float, float>;
5151
%template() std::pair<std::string, std::string>;
5252
%template() std::vector<std::pair<std::string, std::string>>;
53+
%template() std::vector<std::vector<float>>;
5354

5455
%{
5556
/* Ruby and FFmpeg define competing RSHIFT macros,

src/AudioWaveformer.cpp

Lines changed: 36 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -30,8 +30,8 @@ AudioWaveformer::~AudioWaveformer()
3030
}
3131

3232
// Extract audio samples from any ReaderBase class
33-
std::vector<float> AudioWaveformer::ExtractSamples(int channel, int num_per_second, bool normalize) {
34-
std::vector<float> extracted_data(0);
33+
AudioWaveformData AudioWaveformer::ExtractSamples(int channel, int num_per_second, bool normalize) {
34+
AudioWaveformData data;
3535

3636
if (reader) {
3737
// Open reader (if needed)
@@ -45,60 +45,73 @@ std::vector<float> AudioWaveformer::ExtractSamples(int channel, int num_per_seco
4545
int sample_rate = reader->info.sample_rate;
4646
int sample_divisor = sample_rate / num_per_second;
4747
int total_samples = num_per_second * (reader->info.duration + 1.0);
48-
49-
// Size audio buffer (for smaller dataset)
50-
extracted_data.resize(total_samples);
5148
int extracted_index = 0;
5249

53-
// Clear audio buffer
54-
for (auto s = 0; s < total_samples; s++) {
55-
extracted_data[s] = 0.0;
56-
}
50+
// Resize and clear audio buffers
51+
data.resize(total_samples);
52+
data.zero(total_samples);
5753

5854
// Loop through all frames
5955
int sample_index = 0;
60-
float samples_total = 0.0;
6156
float samples_max = 0.0;
62-
float samples_min = 0.0;
57+
float chunk_max = 0.0;
58+
float chunk_squared_sum = 0.0;
59+
60+
// How many channels are we using
61+
int channel_count = 1;
62+
if (channel == -1) {
63+
channel_count = reader->info.channels;
64+
}
6365

6466
for (auto f = 1; f <= reader->info.video_length; f++) {
6567
// Get next frame
6668
shared_ptr<openshot::Frame> frame = reader->GetFrame(f);
6769

68-
float* samples = frame->GetAudioSamples(channel);
70+
// Get sample value from a specific channel (or all channels)
6971
for (auto s = 0; s < frame->GetAudioSamplesCount(); s++) {
70-
samples_total += samples[s];
72+
73+
for (auto channel_index = 0; channel_index < reader->info.channels; channel_index++) {
74+
if (channel == channel_index || channel == -1) {
75+
float *samples = frame->GetAudioSamples(channel_index);
76+
float rms_sample_value = std::sqrt(samples[s] * samples[s]);
77+
78+
// Accumulate sample averages
79+
chunk_squared_sum += rms_sample_value;
80+
chunk_max = std::max(chunk_max, rms_sample_value);
81+
}
82+
}
83+
7184
sample_index += 1;
7285

7386
// Cut-off reached
7487
if (sample_index % sample_divisor == 0) {
75-
float avg_sample_value = samples_total / sample_divisor;
76-
extracted_data[extracted_index] = avg_sample_value;
88+
float avg_squared_sum = chunk_squared_sum / (sample_divisor * channel_count);
89+
data.max_samples[extracted_index] = chunk_max;
90+
data.rms_samples[extracted_index] = avg_squared_sum;
7791
extracted_index++;
7892

7993
// Track max/min values
80-
samples_max = std::max(samples_max, avg_sample_value);
81-
samples_min = std::min(samples_min, avg_sample_value);
94+
samples_max = std::max(samples_max, chunk_max);
8295

8396
// reset sample total and index
8497
sample_index = 0;
85-
samples_total = 0.0;
98+
chunk_max = 0.0;
99+
chunk_squared_sum = 0.0;
86100
}
87101
}
88102
}
89103

90104
// Scale all values to the -1 to +1 range (regardless of how small or how large the
91105
// original audio sample values are)
92106
if (normalize) {
93-
float scale = std::min(1.0f / samples_max, 1.0f / std::fabs(samples_min));
94-
for (auto s = 0; s < total_samples; s++) {
95-
extracted_data[s] *= scale;
96-
}
107+
float scale = 1.0f / samples_max;
108+
data.scale(total_samples, scale);
97109
}
98110

99111
// Resume previous has_video value
100112
reader->info.has_video = does_reader_have_video;
101113
}
102114

103-
return extracted_data;
115+
116+
return data;
104117
}

src/AudioWaveformer.h

Lines changed: 67 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -20,31 +20,81 @@
2020

2121
namespace openshot {
2222

23-
/**
24-
* @brief This class is used to extra audio data used for generating waveforms.
25-
*
26-
* Pass in a ReaderBase* with audio data, and this class will iterate the reader,
27-
* and sample down the dataset to a much smaller set - more useful for generating
28-
* waveforms. For example, take 44100 samples per second, and reduce it to 20
29-
* "average" samples per second - much easier to graph.
30-
*/
31-
class AudioWaveformer {
32-
private:
23+
/**
24+
* @brief This struct holds the extracted waveform data (both the root-mean-square (RMS) average values and the max values)
25+
*
26+
* Because we extract 2 different datasets from the audio, we return this struct with access to both sets of data,
27+
* the average root mean squared values, and the max sample values.
28+
*/
29+
struct AudioWaveformData
{
    /// Max sample value stored for each extracted data point
    std::vector<float> max_samples;
    /// Averaged (RMS) sample value stored for each extracted data point
    std::vector<float> rms_samples;

    /// Resize both datasets so each holds total_samples values
    void resize(int total_samples) {
        max_samples.resize(total_samples);
        rms_samples.resize(total_samples);
    }

    /// Zero out the first total_samples values in both datasets.
    ///
    /// The count is clamped per dataset: a negative count touches nothing, and a
    /// count larger than a dataset only touches the values that dataset holds,
    /// so this never writes outside either vector.
    void zero(int total_samples) {
        const std::vector<float>::size_type max_count = usable_count(max_samples, total_samples);
        for (std::vector<float>::size_type s = 0; s < max_count; ++s) {
            max_samples[s] = 0.0f;
        }

        const std::vector<float>::size_type rms_count = usable_count(rms_samples, total_samples);
        for (std::vector<float>::size_type s = 0; s < rms_count; ++s) {
            rms_samples[s] = 0.0f;
        }
    }

    /// Multiply the first total_samples values in both datasets by factor.
    ///
    /// The count is clamped per dataset in the same way as zero().
    void scale(int total_samples, float factor) {
        const std::vector<float>::size_type max_count = usable_count(max_samples, total_samples);
        for (std::vector<float>::size_type s = 0; s < max_count; ++s) {
            max_samples[s] *= factor;
        }

        const std::vector<float>::size_type rms_count = usable_count(rms_samples, total_samples);
        for (std::vector<float>::size_type s = 0; s < rms_count; ++s) {
            rms_samples[s] *= factor;
        }
    }

    /// Clear both datasets and ask the vectors to release their memory
    void clear() {
        max_samples.clear();
        max_samples.shrink_to_fit();
        rms_samples.clear();
        rms_samples.shrink_to_fit();
    }

    /// Return a vector of vectors containing copies of both datasets
    /// (index 0 == max_samples, index 1 == rms_samples)
    std::vector<std::vector<float>> vectors() const {
        std::vector<std::vector<float>> output;
        output.push_back(max_samples);
        output.push_back(rms_samples);
        return output;
    }

private:
    /// How many leading values of `values` can be touched safely for a requested count
    static std::vector<float>::size_type usable_count(const std::vector<float>& values, int requested) {
        if (requested <= 0) {
            return 0;
        }
        const std::vector<float>::size_type wanted = static_cast<std::vector<float>::size_type>(requested);
        return wanted < values.size() ? wanted : values.size();
    }
};
72+
73+
/**
74+
* @brief This class is used to extract audio data used for generating waveforms.
75+
*
76+
* Pass in a ReaderBase* with audio data, and this class will iterate the reader,
77+
* and sample down the dataset to a much smaller set - more useful for generating
78+
* waveforms. For example, take 44100 samples per second, and reduce it to 20
79+
* "max" or "average" samples per second - much easier to graph.
80+
*/
81+
class AudioWaveformer {
82+
private:
3383
ReaderBase* reader;
3484

35-
public:
36-
/// Default constructor
85+
public:
86+
/// Default constructor
3787
AudioWaveformer(ReaderBase* reader);
3888

3989
/// @brief Extract audio samples from any ReaderBase class
40-
/// @param channel Which audio channel should we extract data from
90+
/// @param channel Which audio channel should we extract data from (-1 == all channels)
4191
/// @param num_per_second How many samples per second to return
4292
/// @param normalize Should we scale the data range so the largest value is 1.0
43-
std::vector<float> ExtractSamples(int channel, int num_per_second, bool normalize);
93+
AudioWaveformData ExtractSamples(int channel, int num_per_second, bool normalize);
4494

45-
/// Destructor
46-
~AudioWaveformer();
47-
};
95+
/// Destructor
96+
~AudioWaveformer();
97+
};
4898

4999
}
50100

tests/AudioWaveformer.cpp

Lines changed: 48 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
* @ref License
77
*/
88

9-
// Copyright (c) 2008-2019 OpenShot Studios, LLC
9+
// Copyright (c) 2008-2022 OpenShot Studios, LLC
1010
//
1111
// SPDX-License-Identifier: LGPL-3.0-or-later
1212

@@ -28,22 +28,18 @@ TEST_CASE( "Extract waveform data piano.wav", "[libopenshot][audiowaveformer]" )
2828
// Create AudioWaveformer and extract a smaller "average" sample set of audio data
2929
AudioWaveformer waveformer(&r);
3030
for (auto channel = 0; channel < r.info.channels; channel++) {
31-
std::vector<float> waveform = waveformer.ExtractSamples(channel, 20, false);
31+
AudioWaveformData waveform = waveformer.ExtractSamples(channel, 20, false);
3232

3333
if (channel == 0) {
34-
CHECK(waveform.size() == 107);
35-
CHECK(waveform[0] == Approx(0.000820312474f).margin(0.00001));
36-
CHECK(waveform[86] == Approx(-0.00144531252f).margin(0.00001));
37-
CHECK(waveform[87] == Approx(0.0f).margin(0.00001));
38-
39-
for (auto sample = 0; sample < waveform.size(); sample++) {
40-
std::cout << waveform[sample] << std::endl;
41-
}
34+
CHECK(waveform.rms_samples.size() == 107);
35+
CHECK(waveform.rms_samples[0] == Approx(0.04879f).margin(0.00001));
36+
CHECK(waveform.rms_samples[86] == Approx(0.13578f).margin(0.00001));
37+
CHECK(waveform.rms_samples[87] == Approx(0.0f).margin(0.00001));
4238
} else if (channel == 1) {
43-
CHECK(waveform.size() == 107);
44-
CHECK(waveform[0] == Approx(0.000820312474f).margin(0.00001));
45-
CHECK(waveform[86] == Approx(-0.00144531252f).margin(0.00001));
46-
CHECK(waveform[87] == Approx(0.0f).margin(0.00001));
39+
CHECK(waveform.rms_samples.size() == 107);
40+
CHECK(waveform.rms_samples[0] == Approx(0.04879f).margin(0.00001));
41+
CHECK(waveform.rms_samples[86] == Approx(0.13578f).margin(0.00001));
42+
CHECK(waveform.rms_samples[87] == Approx(0.0f).margin(0.00001));
4743
}
4844

4945
waveform.clear();
@@ -63,18 +59,18 @@ TEST_CASE( "Extract waveform data sintel", "[libopenshot][audiowaveformer]" )
6359
// Create AudioWaveformer and extract a smaller "average" sample set of audio data
6460
AudioWaveformer waveformer(&r);
6561
for (auto channel = 0; channel < r.info.channels; channel++) {
66-
std::vector<float> waveform = waveformer.ExtractSamples(channel, 20, false);
62+
AudioWaveformData waveform = waveformer.ExtractSamples(channel, 20, false);
6763

6864
if (channel == 0) {
69-
CHECK(waveform.size() == 1058);
70-
CHECK(waveform[0] == Approx(-1.48391728e-05f).margin(0.00001));
71-
CHECK(waveform[1037] == Approx(6.79016102e-06f).margin(0.00001));
72-
CHECK(waveform[1038] == Approx(0.0f).margin(0.00001));
65+
CHECK(waveform.rms_samples.size() == 1058);
66+
CHECK(waveform.rms_samples[0] == Approx(0.00001f).margin(0.00001));
67+
CHECK(waveform.rms_samples[1037] == Approx(0.00003f).margin(0.00001));
68+
CHECK(waveform.rms_samples[1038] == Approx(0.0f).margin(0.00001));
7369
} else if (channel == 1) {
74-
CHECK(waveform.size() == 1058);
75-
CHECK(waveform[0] == Approx(-1.43432617e-05f).margin(0.00001));
76-
CHECK(waveform[1037] == Approx(6.79016102e-06f).margin(0.00001));
77-
CHECK(waveform[1038] == Approx(0.0f).margin(0.00001));
70+
CHECK(waveform.rms_samples.size() == 1058);
71+
CHECK(waveform.rms_samples[0] == Approx(0.00001f ).margin(0.00001));
72+
CHECK(waveform.rms_samples[1037] == Approx(0.00003f).margin(0.00001));
73+
CHECK(waveform.rms_samples[1038] == Approx(0.0f).margin(0.00001));
7874
}
7975

8076
waveform.clear();
@@ -84,6 +80,29 @@ TEST_CASE( "Extract waveform data sintel", "[libopenshot][audiowaveformer]" )
8480
r.Close();
8581
}
8682

83+
84+
TEST_CASE( "Extract waveform data sintel (all channels)", "[libopenshot][audiowaveformer]" )
{
    // Build the path to the test media and open a reader on it
    std::ostringstream media_file;
    media_file << TEST_MEDIA_PATH << "sintel_trailer-720p.mp4";
    const std::string media_path = media_file.str();
    FFmpegReader reader(media_path);

    // Extraction settings: channel -1 selects all channels, 20 values per second, no normalization
    const int all_channels = -1;
    const int values_per_second = 20;
    const bool normalize = false;

    // Reduce the reader's audio to a small waveform dataset
    AudioWaveformer waveformer(&reader);
    AudioWaveformData waveform = waveformer.ExtractSamples(all_channels, values_per_second, normalize);

    // Spot-check the size and a few values of the averaged dataset
    CHECK(waveform.rms_samples.size() == 1058);
    CHECK(waveform.rms_samples[0] == Approx(0.00001f).margin(0.00001));
    CHECK(waveform.rms_samples[1037] == Approx(0.00003f).margin(0.00001));
    CHECK(waveform.rms_samples[1038] == Approx(0.0f).margin(0.00001));

    // Release the extracted data before closing the reader
    waveform.clear();

    // Clean up
    reader.Close();
}
105+
87106
TEST_CASE( "Normalize & scale waveform data piano.wav", "[libopenshot][audiowaveformer]" )
88107
{
89108
// Create a reader
@@ -95,14 +114,14 @@ TEST_CASE( "Normalize & scale waveform data piano.wav", "[libopenshot][audiowave
95114
AudioWaveformer waveformer(&r);
96115
for (auto channel = 0; channel < r.info.channels; channel++) {
97116
// Normalize values and scale them between -1 and +1
98-
std::vector<float> waveform = waveformer.ExtractSamples(channel, 20, true);
117+
AudioWaveformData waveform = waveformer.ExtractSamples(channel, 20, true);
99118

100119
if (channel == 0) {
101-
CHECK(waveform.size() == 107);
102-
CHECK(waveform[0] == Approx(0.113821134).margin(0.00001));
103-
CHECK(waveform[35] == Approx(-1.0f).margin(0.00001));
104-
CHECK(waveform[86] == Approx(-0.200542003f).margin(0.00001));
105-
CHECK(waveform[87] == Approx(0.0f).margin(0.00001));
120+
CHECK(waveform.rms_samples.size() == 107);
121+
CHECK(waveform.rms_samples[0] == Approx(0.07524f).margin(0.00001));
122+
CHECK(waveform.rms_samples[35] == Approx(0.20063f).margin(0.00001));
123+
CHECK(waveform.rms_samples[86] == Approx(0.2094f).margin(0.00001));
124+
CHECK(waveform.rms_samples[87] == Approx(0.0f).margin(0.00001));
106125
}
107126

108127
waveform.clear();

0 commit comments

Comments
 (0)