OpenShot Library | libopenshot  0.7.0
AudioWaveformer.cpp
Go to the documentation of this file.
1 
9 // Copyright (c) 2008-2022 OpenShot Studios, LLC
10 //
11 // SPDX-License-Identifier: LGPL-3.0-or-later
12 
13 #include "AudioWaveformer.h"
14 
15 #include <cmath>
16 
17 #include <algorithm>
18 #include <chrono>
19 #include <memory>
20 #include <thread>
21 #include <vector>
22 
23 #include "Clip.h"
24 #include "Exceptions.h"
25 #include "FrameMapper.h"
26 #include "FFmpegReader.h"
27 #include "Timeline.h"
28 
29 
30 using namespace std;
31 using namespace openshot;
32 
33 
34 // Default constructor
35 AudioWaveformer::AudioWaveformer(ReaderBase* new_reader) :
36  reader(new_reader),
37  detached_reader(nullptr),
38  resolved_reader(nullptr),
39  source_initialized(false)
40 {
41 
42 }
43 
44 // Destructor
46 {
47 
48 }
49 
50 // Extract audio samples from any ReaderBase class
51 AudioWaveformData AudioWaveformer::ExtractSamples(int channel, int num_per_second, bool normalize) {
52  // Legacy entry point: resolve a source reader (unwrap Clip/FrameMapper), then extract audio-only.
53  AudioWaveformData data;
54  if (!reader) {
55  return data;
56  }
57 
58  ReaderBase* source = ResolveWaveformReader();
59 
60  Fraction source_fps = ResolveSourceFPS(source);
61 
62  AudioWaveformData base = ExtractSamplesFromReader(source, channel, num_per_second, false);
63 
64  // If this is a Clip, apply its keyframes using project fps (timeline if available, else reader fps)
65  if (auto clip = dynamic_cast<Clip*>(reader)) {
66  Timeline* timeline = dynamic_cast<Timeline*>(clip->ParentTimeline());
67  Fraction project_fps = timeline ? timeline->info.fps : clip->Reader()->info.fps;
68  return ApplyKeyframes(base, &clip->time, &clip->volume, project_fps, source_fps, source->info.channels, num_per_second, channel, normalize);
69  }
70 
71  // No keyframes to apply
72  if (normalize) {
73  float max_sample = 0.0f;
74  for (auto v : base.max_samples) {
75  max_sample = std::max(max_sample, std::abs(v));
76  }
77  if (max_sample > 0.0f) {
78  base.scale(static_cast<int>(base.max_samples.size()), 1.0f / max_sample);
79  }
80  }
81  return base;
82 }
83 
84 AudioWaveformData AudioWaveformer::ExtractSamples(const std::string& path, int channel, int num_per_second, bool normalize) {
85  FFmpegReader temp_reader(path);
86  temp_reader.Open();
87  // Disable video for speed
88  bool has_video = temp_reader.info.has_video;
89  temp_reader.info.has_video = false;
90  AudioWaveformData data = ExtractSamplesFromReader(&temp_reader, channel, num_per_second, normalize);
91  temp_reader.info.has_video = has_video;
92  temp_reader.Close();
93  return data;
94 }
95 
97  const Keyframe* time_keyframe,
98  const Keyframe* volume_keyframe,
99  const Fraction& project_fps,
100  int channel,
101  int num_per_second,
102  bool normalize) {
103  FFmpegReader temp_reader(path);
104  temp_reader.Open();
105  bool has_video = temp_reader.info.has_video;
106  temp_reader.info.has_video = false;
107  Fraction source_fps = temp_reader.info.fps;
108  AudioWaveformData base = ExtractSamplesFromReader(&temp_reader, channel, num_per_second, false);
109  temp_reader.info.has_video = has_video;
110  temp_reader.Close();
111  return ApplyKeyframes(base, time_keyframe, volume_keyframe, project_fps, source_fps, temp_reader.info.channels, num_per_second, channel, normalize);
112 }
113 
115  const Keyframe* time_keyframe,
116  const Keyframe* volume_keyframe,
117  const Fraction& project_fps,
118  const Fraction& source_fps,
119  int source_channels,
120  int num_per_second,
121  int channel,
122  bool normalize) {
123  AudioWaveformData data;
124  if (num_per_second <= 0) {
125  return data;
126  }
127 
128  double project_fps_value = project_fps.ToDouble();
129  double source_fps_value = source_fps.ToDouble();
130  if (project_fps_value <= 0.0 || source_fps_value <= 0.0) {
131  return data;
132  }
133 
134  if (channel != -1 && (channel < 0 || channel >= source_channels)) {
135  return data;
136  }
137 
138  size_t base_total = base.max_samples.size();
139  if (base_total == 0) {
140  return data;
141  }
142 
143  // Determine output duration from time curve (if any). Time curves are in project-frame domain.
144  int64_t output_frames = 0;
145  if (time_keyframe && time_keyframe->GetCount() > 0) {
146  output_frames = time_keyframe->GetLength();
147  }
148  if (output_frames <= 0) {
149  // Default to source duration derived from base waveform length
150  double source_duration = static_cast<double>(base_total) / static_cast<double>(num_per_second);
151  output_frames = static_cast<int64_t>(std::llround(source_duration * project_fps_value));
152  }
153  double output_duration_seconds = static_cast<double>(output_frames) / project_fps_value;
154  int total_samples = static_cast<int>(std::ceil(output_duration_seconds * num_per_second));
155 
156  if (total_samples <= 0) {
157  return data;
158  }
159 
160  data.resize(total_samples);
161  data.zero(total_samples);
162 
163  for (int i = 0; i < total_samples; ++i) {
164  double out_time = static_cast<double>(i) / static_cast<double>(num_per_second);
165  // Time keyframes are defined in project-frame domain; evaluate using project frames
166  double project_frame = out_time * project_fps_value;
167  double mapped_project_frame = time_keyframe ? time_keyframe->GetValue(project_frame) : project_frame;
168  // Convert mapped project frame to seconds (project FPS), then to waveform index
169  double source_time = mapped_project_frame / project_fps_value;
170  double source_index = source_time * static_cast<double>(num_per_second);
171 
172  // Sample base waveform (nearest with simple linear blend)
173  int idx0 = static_cast<int>(std::floor(source_index));
174  int idx1 = idx0 + 1;
175  double frac = source_index - static_cast<double>(idx0);
176 
177  float max_sample = 0.0f;
178  float rms_sample = 0.0f;
179  if (idx0 >= 0 && idx0 < static_cast<int>(base_total)) {
180  max_sample = base.max_samples[idx0];
181  rms_sample = base.rms_samples[idx0];
182  }
183  if (idx1 >= 0 && idx1 < static_cast<int>(base_total)) {
184  max_sample = static_cast<float>((1.0 - frac) * max_sample + frac * base.max_samples[idx1]);
185  rms_sample = static_cast<float>((1.0 - frac) * rms_sample + frac * base.rms_samples[idx1]);
186  }
187 
188  double gain = 1.0;
189  if (volume_keyframe) {
190  double project_frame = out_time * project_fps_value;
191  gain = volume_keyframe->GetValue(project_frame);
192  }
193  max_sample = static_cast<float>(max_sample * gain);
194  rms_sample = static_cast<float>(rms_sample * gain);
195 
196  data.max_samples[i] = max_sample;
197  data.rms_samples[i] = rms_sample;
198  }
199 
200  if (normalize) {
201  float samples_max = 0.0f;
202  for (auto v : data.max_samples) {
203  samples_max = std::max(samples_max, std::abs(v));
204  }
205  if (samples_max > 0.0f) {
206  data.scale(total_samples, 1.0f / samples_max);
207  }
208  }
209 
210  return data;
211 }
212 
213 AudioWaveformData AudioWaveformer::ExtractSamplesFromReader(ReaderBase* source_reader, int channel, int num_per_second, bool normalize) {
214  AudioWaveformData data;
215 
216  if (!source_reader || num_per_second <= 0) {
217  return data;
218  }
219 
220  // Open reader (if needed)
221  if (!source_reader->IsOpen()) {
222  source_reader->Open();
223  }
224 
225  const auto retry_delay = std::chrono::milliseconds(100);
226  const auto max_wait_for_open = std::chrono::milliseconds(3000);
227 
228  auto get_frame_with_retry = [&](int64_t frame_number) -> std::shared_ptr<openshot::Frame> {
229  std::chrono::steady_clock::time_point wait_start;
230  bool waiting_for_open = false;
231  while (true) {
232  try {
233  return source_reader->GetFrame(frame_number);
234  } catch (const openshot::ReaderClosed&) {
235  auto now = std::chrono::steady_clock::now();
236  if (!waiting_for_open) {
237  waiting_for_open = true;
238  wait_start = now;
239  } else if (now - wait_start >= max_wait_for_open) {
240  throw;
241  }
242 
243  std::this_thread::sleep_for(retry_delay);
244  }
245  }
246  };
247 
248  int sample_rate = source_reader->info.sample_rate;
249  if (sample_rate <= 0) {
250  sample_rate = num_per_second;
251  }
252  int sample_divisor = sample_rate / num_per_second;
253  if (sample_divisor <= 0) {
254  sample_divisor = 1;
255  }
256 
257  // Determine length of video frames (for waveform)
258  int64_t reader_video_length = source_reader->info.video_length;
259  if (reader_video_length < 0) {
260  reader_video_length = 0;
261  }
262  float reader_duration = source_reader->info.duration;
263  double fps_value = source_reader->info.fps.ToDouble();
264  float frames_duration = 0.0f;
265  if (reader_video_length > 0 && fps_value > 0.0) {
266  frames_duration = static_cast<float>(reader_video_length / fps_value);
267  }
268  if (reader_duration <= 0.0f) {
269  reader_duration = frames_duration;
270  }
271  if (reader_duration < 0.0f) {
272  reader_duration = 0.0f;
273  }
274 
275  if (!source_reader->info.has_audio) {
276  return data;
277  }
278 
279  int total_samples = static_cast<int>(std::ceil(reader_duration * num_per_second));
280  if (total_samples <= 0 || source_reader->info.channels == 0) {
281  return data;
282  }
283 
284  if (channel != -1 && (channel < 0 || channel >= source_reader->info.channels)) {
285  return data;
286  }
287 
288  // Resize and clear audio buffers
289  data.resize(total_samples);
290  data.zero(total_samples);
291 
292  int extracted_index = 0;
293  int sample_index = 0;
294  float samples_max = 0.0f;
295  float chunk_max = 0.0f;
296  double chunk_squared_sum = 0.0;
297 
298  int channel_count = (channel == -1) ? source_reader->info.channels : 1;
299  std::vector<float*> channels(source_reader->info.channels, nullptr);
300 
301  try {
302  for (int64_t f = 1; f <= reader_video_length && extracted_index < total_samples; f++) {
303  std::shared_ptr<openshot::Frame> frame = get_frame_with_retry(f);
304 
305  for (int channel_index = 0; channel_index < source_reader->info.channels; channel_index++) {
306  if (channel == channel_index || channel == -1) {
307  channels[channel_index] = frame->GetAudioSamples(channel_index);
308  }
309  }
310 
311  int sample_count = frame->GetAudioSamplesCount();
312  for (int s = 0; s < sample_count; s++) {
313  for (int channel_index = 0; channel_index < source_reader->info.channels; channel_index++) {
314  if (channel == channel_index || channel == -1) {
315  float *samples = channels[channel_index];
316  if (!samples) {
317  continue;
318  }
319  float abs_sample = std::abs(samples[s]);
320  chunk_squared_sum += static_cast<double>(samples[s]) * static_cast<double>(samples[s]);
321  chunk_max = std::max(chunk_max, abs_sample);
322  }
323  }
324 
325  sample_index += 1;
326 
327  if (sample_index % sample_divisor == 0) {
328  float avg_squared_sum = 0.0f;
329  if (channel_count > 0) {
330  avg_squared_sum = static_cast<float>(chunk_squared_sum / static_cast<double>(sample_divisor * channel_count));
331  }
332 
333  if (extracted_index < total_samples) {
334  data.max_samples[extracted_index] = chunk_max;
335  data.rms_samples[extracted_index] = std::sqrt(avg_squared_sum);
336  samples_max = std::max(samples_max, chunk_max);
337  extracted_index++;
338  }
339 
340  sample_index = 0;
341  chunk_max = 0.0f;
342  chunk_squared_sum = 0.0;
343 
344  if (extracted_index >= total_samples) {
345  break;
346  }
347  }
348  }
349  }
350  } catch (...) {
351  throw;
352  }
353 
354  if (sample_index > 0 && extracted_index < total_samples) {
355  float avg_squared_sum = 0.0f;
356  if (channel_count > 0) {
357  avg_squared_sum = static_cast<float>(chunk_squared_sum / static_cast<double>(sample_index * channel_count));
358  }
359 
360  data.max_samples[extracted_index] = chunk_max;
361  data.rms_samples[extracted_index] = std::sqrt(avg_squared_sum);
362  samples_max = std::max(samples_max, chunk_max);
363  extracted_index++;
364  }
365 
366  if (normalize && samples_max > 0.0f) {
367  float scale = 1.0f / samples_max;
368  data.scale(total_samples, scale);
369  }
370 
371  return data;
372 }
373 
374 ReaderBase* AudioWaveformer::ResolveSourceReader(ReaderBase* source_reader) {
375  if (!source_reader) {
376  return nullptr;
377  }
378 
379  ReaderBase* current = source_reader;
380  while (true) {
381  if (auto clip = dynamic_cast<Clip*>(current)) {
382  current = clip->Reader();
383  continue;
384  }
385  if (auto mapper = dynamic_cast<FrameMapper*>(current)) {
386  current = mapper->Reader();
387  continue;
388  }
389  break;
390  }
391  return current;
392 }
393 
394 Fraction AudioWaveformer::ResolveSourceFPS(ReaderBase* source_reader) {
395  if (!source_reader) {
396  return Fraction(0, 1);
397  }
398  return source_reader->info.fps;
399 }
400 
401 // Resolve and cache the reader used for waveform extraction (prefer a detached FFmpegReader clone)
402 ReaderBase* AudioWaveformer::ResolveWaveformReader() {
403  if (source_initialized) {
404  return resolved_reader ? resolved_reader : reader;
405  }
406  source_initialized = true;
407 
408  resolved_reader = ResolveSourceReader(reader);
409 
410  // Prefer a detached, audio-only FFmpegReader clone so we never mutate the live reader used for preview.
411  if (auto ff_reader = dynamic_cast<FFmpegReader*>(resolved_reader)) {
412  const Json::Value ff_json = ff_reader->JsonValue();
413  const std::string path = ff_json.get("path", "").asString();
414  if (!path.empty()) {
415  try {
416  auto clone = std::make_unique<FFmpegReader>(path, false);
417  clone->SetJsonValue(ff_json);
418  clone->info.has_video = false; // explicitly audio-only for waveform extraction
419  detached_reader = std::move(clone);
420  resolved_reader = detached_reader.get();
421  } catch (...) {
422  // Fall back to using the original reader if cloning fails
423  detached_reader.reset();
424  resolved_reader = ResolveSourceReader(reader);
425  }
426  }
427  }
428 
429  return resolved_reader ? resolved_reader : reader;
430 }
Header file for AudioWaveformer class.
Header file for Clip class.
Header file for all Exception classes.
Header file for FFmpegReader class.
Header file for the FrameMapper class.
Header file for Timeline class.
AudioWaveformData ApplyKeyframes(const AudioWaveformData &base, const openshot::Keyframe *time_keyframe, const openshot::Keyframe *volume_keyframe, const openshot::Fraction &project_fps, const openshot::Fraction &source_fps, int source_channels, int num_per_second, int channel, bool normalize)
Apply time and volume keyframes to an existing waveform data set.
AudioWaveformData ExtractSamples(int channel, int num_per_second, bool normalize)
Extract audio samples from any ReaderBase class (legacy overload, now delegates to audio-only path)
This class represents a clip (used to arrange readers on the timeline)
Definition: Clip.h:89
This class uses the FFmpeg libraries, to open video files and audio files, and return openshot::Frame...
Definition: FFmpegReader.h:103
void Open() override
Open File - which is called by the constructor automatically.
void Close() override
Close File.
This class represents a fraction.
Definition: Fraction.h:30
double ToDouble() const
Return this fraction as a double (i.e. 1/2 = 0.5)
Definition: Fraction.cpp:40
This class creates a mapping between 2 different frame rates, applying a specific pull-down technique...
Definition: FrameMapper.h:193
A Keyframe is a collection of Point instances, which is used to vary a number or property over time.
Definition: KeyFrame.h:53
int64_t GetLength() const
Definition: KeyFrame.cpp:417
double GetValue(int64_t index) const
Get the value at a specific index.
Definition: KeyFrame.cpp:258
int64_t GetCount() const
Get the number of points (i.e. # of points)
Definition: KeyFrame.cpp:424
This abstract class is the base class, used by all readers in libopenshot.
Definition: ReaderBase.h:76
virtual bool IsOpen()=0
Determine if reader is open or closed.
openshot::ReaderInfo info
Information about the current media file.
Definition: ReaderBase.h:90
virtual std::shared_ptr< openshot::Frame > GetFrame(int64_t number)=0
virtual void Open()=0
Open the reader (and start consuming resources, such as images or video files)
Exception when a reader is closed, and a frame is requested.
Definition: Exceptions.h:370
This class represents a timeline.
Definition: Timeline.h:153
This namespace is the default namespace for all code in the openshot library.
Definition: Compressor.h:29
This struct holds the extracted waveform data (both the RMS root-mean-squared average,...
void resize(int total_samples)
Resize both datasets.
std::vector< float > rms_samples
std::vector< float > max_samples
void zero(int total_samples)
Zero out # of values in both datasets.
void scale(int total_samples, float factor)
Scale # of values by some factor.
float duration
Length of time (in seconds)
Definition: ReaderBase.h:43
int channels
The number of audio channels used in the audio stream.
Definition: ReaderBase.h:61
openshot::Fraction fps
Frames per second, as a fraction (i.e. 24/1 = 24 fps)
Definition: ReaderBase.h:48
int64_t video_length
The number of frames in the video stream.
Definition: ReaderBase.h:53
bool has_video
Determines if this file has a video stream.
Definition: ReaderBase.h:40
bool has_audio
Determines if this file has an audio stream.
Definition: ReaderBase.h:41
int sample_rate
The number of audio samples per second (44100 is a common sample rate)
Definition: ReaderBase.h:60