Audio/video sync improvements for PyavOutput and CircularOutput2

davidplowman · davidplowman · commit 17862857e956 · 2024-11-06T13:40:18.000Z
Plus a few other miscellaneous fixes, and also a new streaming
example.

Signed-off-by: David Plowman <david.plowman@raspberrypi.com>
diff --git a/examples/pyav_stream2.py b/examples/pyav_stream2.py
@@ -0,0 +1,37 @@
+#!/usr/bin/python3
+
+import socket
+from threading import Event
+
+from picamera2 import Picamera2
+from picamera2.encoders import H264Encoder
+from picamera2.outputs import PyavOutput
+
+picam2 = Picamera2()
+video_config = picam2.create_video_configuration({"size": (1280, 720), 'format': 'YUV420'})
+picam2.configure(video_config)
+
+encoder = H264Encoder(bitrate=10000000)
+encoder.audio = True
+
+with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
+    sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
+    sock.bind(("0.0.0.0", 8888))
+
+    while True:
+        print("Waiting")
+        sock.listen()
+
+        conn, addr = sock.accept()
+        print("Connected")
+
+        output = PyavOutput(f"pipe:{conn.fileno()}", format="mpegts")
+        event = Event()
+        output.error_callback = lambda e: event.set()  # noqa
+
+        picam2.start_recording(encoder, output)
+
+        event.wait()
+        print("Disconnected")
+
+        picam2.stop_recording()
diff --git a/picamera2/encoders/encoder.py b/picamera2/encoders/encoder.py
@@ -3,6 +3,7 @@
 import threading
 from enum import Enum
 
+import av
 from libcamera import controls
 
 import picamera2.formats as formats
@@ -25,7 +26,25 @@ class Quality(Enum):
 
 
 class Encoder:
-    """Base class for encoders"""
+    """
+    Base class for encoders.
+
+    Mostly this defines the API for derived encoder classes, but it also handles optional audio encoding.
+    For audio, a separate thread is started, which encodes audio packets and forwards them to the
+    encoder's output object(s). This only works when the output object understands the audio stream,
+    meaning that (at the time of writing) this must be a PyavOutput (though you could send output there
+    via a CircularOutput2).
+
+    Additional audio parameters:
+    audio - set to True to enable audio encoding and output.
+    audio_input - dict of keyword arguments passed to PyAV's av.open to create the audio input.
+    audio_output - dict of keyword arguments passed to PyAV's add_stream to define the audio codec and output stream.
+    audio_sync - value (in us) by which to advance the audio stream to better sync with the video.
+
+    Reasonable defaults are supplied so that applications can often just set the audio property to True.
+    The audio_input and audio_output parameters are passed directly to PyAV, so will accept whatever PyAV
+    understands.
+    """
 
     def __init__(self):
         """Initialises encoder"""
@@ -40,6 +59,15 @@ def __init__(self):
         self.firsttimestamp = None
         self.frame_skip_count = 1
         self._skip_count = 0
+        self._output_lock = threading.Lock()
+        # Set to True to enable audio.
+        self.audio = False
+        # These parameters are passed to Pyav to open the input audio container.
+        self.audio_input = {'file': 'default', 'format': 'pulse'}
+        # These parameters are passed to Pyav for creating the encoded audio output stream.
+        self.audio_output = {'codec_name': 'aac'}
+        self.audio_sync = -100000  # in us, so by default, delay audio by 100ms
+        self._audio_start = threading.Event()
 
     @property
     def running(self):
@@ -208,6 +236,8 @@ def encode(self, stream, request):
         :param request: Request
         :type request: request
         """
+        if self.audio:
+            self._audio_start.set()  # Signal the audio encode thread to start.
         if self._skip_count == 0:
             with self._lock:
                 self._encode(stream, request)
@@ -226,10 +256,24 @@ def start(self, quality=None):
                 raise RuntimeError("Encoder already running")
             self._setup(quality)
             self._running = True
+            self.firsttimestamp = None
             for out in self._output:
                 out.start()
             self._start()
 
+            # Start the audio, if that's been requested.
+            if self.audio:
+                self._audio_input_container = av.open(**self.audio_input)
+                self._audio_input_stream = self._audio_input_container.streams.get(audio=0)[0]
+                self._audio_output_container = av.open("/dev/null", 'w', format="null")
+                self._audio_output_stream = self._audio_output_container.add_stream(**self.audio_output)
+                # Outputs that can handle audio need to be told about its existence.
+                for out in self._output:
+                    out._add_stream(self._audio_output_stream, **self.audio_output)
+                self._audio_thread = threading.Thread(target=self._audio_thread_func, daemon=True)
+                self._audio_start.clear()
+                self._audio_thread.start()  # audio thread will wait for the _audio_start event.
+
     def _start(self):
         pass
 
@@ -239,26 +283,28 @@ def stop(self):
                 raise RuntimeError("Encoder already stopped")
             self._running = False
             self._stop()
+            if self.audio:
+                self._audio_start.set()  # just in case it wasn't!
+                self._audio_thread.join()
+                self._audio_input_container.close()
+                self._audio_output_container.close()
             for out in self._output:
                 out.stop()
 
     def _stop(self):
         pass
 
-    def outputframe(self, frame, keyframe=True, timestamp=None, packet=None):
+    def outputframe(self, frame, keyframe=True, timestamp=None, packet=None, audio=False):
         """Writes a frame
 
         :param frame: Frame
         :type frame: bytes
         :param keyframe: Whether frame is a keyframe or not, defaults to True
         :type keyframe: bool, optional
         """
-        if packet:
-            for out in self._output:
-                out.outputframe(frame, keyframe, timestamp, packet)
-        else:
+        with self._output_lock:
             for out in self._output:
-                out.outputframe(frame, keyframe, timestamp)
+                out.outputframe(frame, keyframe, timestamp, packet, audio)
 
     def _setup(self, quality):
         pass
@@ -272,3 +318,33 @@ def _timestamp(self, request):
         else:
             timestamp_us = ts - self.firsttimestamp
         return timestamp_us
+
+    def _handle_audio_packet(self, audio_packet):
+        # Write out an audio packet, dealing with timestamp adjustments.
+        time_scale_factor = 1000000 * self._audio_output_stream.codec_context.time_base
+        delta = int(self.audio_sync / time_scale_factor)  # convert to audio time base
+        audio_packet.pts -= delta
+        audio_packet.dts -= delta
+        timestamp = int(audio_packet.pts * time_scale_factor)  # want this in us
+        if audio_packet.pts >= 0:
+            self.outputframe(None, True, timestamp, audio_packet, True)
+
+    def _audio_thread_func(self):
+        # Audio thread that fetches audio packets, encodes them and forwards them to the output.
+        # The output has to be able to understand audio, which means using a PyavOutput.
+        # _audio_start gets signalled when the first video frame is submitted for encode, which will hopefully
+        # keep the audio_sync adjustment more similar across different devices. Until that happens, though,
+        # we must keep consuming and discarding the audio.
+        for _ in self._audio_input_container.decode(self._audio_input_stream):
+            if self._audio_start.isSet():
+                break
+
+        for audio_frame in self._audio_input_container.decode(self._audio_input_stream):
+            if not self._running:
+                break
+            for audio_packet in self._audio_output_stream.encode(audio_frame):
+                self._handle_audio_packet(audio_packet)
+
+        # Flush out any remaining audio packets.
+        for audio_packet in self._audio_output_stream.encode(None):
+            self._handle_audio_packet(audio_packet)
diff --git a/picamera2/outputs/circularoutput.py b/picamera2/outputs/circularoutput.py
@@ -40,7 +40,7 @@ def buffersize(self, value):
             self._buffersize = value
             self._circular = collections.deque(maxlen=value)
 
-    def outputframe(self, frame, keyframe=True, timestamp=None, packet=None):
+    def outputframe(self, frame, keyframe=True, timestamp=None, packet=None, audio=False):
         """Write frame to circular buffer
 
         :param frame: Frame
@@ -50,6 +50,8 @@ def outputframe(self, frame, keyframe=True, timestamp=None, packet=None):
         :param timestamp: Timestamp of frame
         :type timestamp: int
         """
+        if audio:
+            raise RuntimeError("CircularOutput does not support audio")
         with self._lock:
             if self._buffersize == 0:
                 return
diff --git a/picamera2/outputs/circularoutput2.py b/picamera2/outputs/circularoutput2.py
@@ -7,14 +7,19 @@
 
 
 class CircularOutput2(Output):
-    """Circular buffer implementation for general outputs
+    """
+    Circular buffer implementation, much like CircularOutput, but for general outputs.
+
+    This means it can be used in conjunction with, for example, a PyavOutput to create time-shifted
+    recordings of both video and audio straight to an mp4 file.
 
-    Very like the original CircularOutput, but this version can also be used with a
-    PyavOutput underneath, so as directly to create mp4 files.
+    Once the CircularOutput2 has been started, use the open_output method to start recording
+    a new output, and use close_output when finished. If the output has not been closed when the
+    circular buffer is stopped, then the remainder of the buffer will be flushed into the output.
     """
 
-    def __init__(self, pts=None, buffer_duration_ms=5000, always_output=True):
-        """Creates circular buffer for 5s worth of 30fps frames"""
+    def __init__(self, pts=None, buffer_duration_ms=5000):
+        """Create a CircularOutput2."""
         super().__init__(pts=pts)
         # A note on locking. The lock is principally to protect outputframe, which is called by
         # the background encoder thread. Applications are going to call things like open_output,
@@ -27,7 +32,6 @@ def __init__(self, pts=None, buffer_duration_ms=5000, always_output=True):
             raise RuntimeError("buffer_duration_ms may not be negative")
         self._buffer_duration_ms = buffer_duration_ms
         self._circular = collections.deque()
-        self.always_output = always_output
         self._output = None
         self._output_available = False
         self._streams = []
@@ -44,7 +48,7 @@ def buffer_duration_ms(self, value):
             self._buffer_duration_ms = value
 
     def open_output(self, output):
-        """Set a new output object"""
+        """Open a new output object and start writing to it."""
         if self._output:
             raise RuntimeError("Underlying output must be closed first")
 
@@ -72,29 +76,32 @@ def close_output(self):
         self._output = None
 
     def _get_frame(self):
+        # Fetch the next frame to be saved to the underlying output.
         if not self._circular:
             return
         if not self._first_frame:
             return self._circular.popleft()
         # Must skip ahead to the first I frame if we haven't seen one yet.
         while self._circular:
             entry = self._circular.popleft()
-            _, key_frame, _, _ = entry
-            if key_frame:
+            _, key_frame, _, _, audio = entry
+            # If there is audio, all audio frames are likely to be keyframes, so we must ignore them when
+            # deciding when the streams can resume - only the video counts.
+            if key_frame and not audio:
                 self._first_frame = False
                 return entry
 
-    def outputframe(self, frame, keyframe=True, timestamp=None, packet=None):
+    def outputframe(self, frame, keyframe=True, timestamp=None, packet=None, audio=False):
         """Write frame to circular buffer"""
         with self._lock:
             if self._buffer_duration_ms == 0 or not self.recording:
                 return
-            self._circular.append((frame, keyframe, timestamp, packet))
+            self._circular.append((frame, keyframe, timestamp, packet, audio))
             # Discard any expired buffer entries.
             while timestamp - self._circular[0][2] > self._buffer_duration_ms * 1000:
                 self._circular.popleft()
 
-            if self._output_available and self.always_output:
+            if self._output_available:
                 # Actually write this to the underlying output.
                 entry = self._get_frame()
                 if entry:
@@ -108,7 +115,7 @@ def start(self):
             self.recording = True
 
     def stop(self):
-        """Close file handle and prevent recording"""
+        """Close file handle and stop recording"""
         with self._lock:
             if not self.recording:
                 raise RuntimeError("Circular output was not started")
@@ -123,5 +130,7 @@ def stop(self):
             self._output.stop()
             self._output = None
 
-    def _add_stream(self, encoder_stream, codec, **kwargs):
-        self._streams.append((encoder_stream, codec, kwargs))
+    def _add_stream(self, encoder_stream, codec_name, **kwargs):
+        # Record a stream that will be sending packets to write out. The stream details will need
+        # to be forwarded to the underlying output whenever a new one is opened.
+        self._streams.append((encoder_stream, codec_name, kwargs))
diff --git a/picamera2/outputs/ffmpegoutput.py b/picamera2/outputs/ffmpegoutput.py
@@ -97,7 +97,9 @@ def stop(self):
             # This seems to be necessary to get the subprocess to clean up fully.
             gc.collect()
 
-    def outputframe(self, frame, keyframe=True, timestamp=None, packet=None):
+    def outputframe(self, frame, keyframe=True, timestamp=None, packet=None, audio=False):
+        if audio:
+            raise RuntimeError("FfmpegOutput does not support audio packets from Picamera2")
         if self.recording and self.ffmpeg:
             # Handle the case where the FFmpeg prcoess has gone away for reasons of its own.
             try:
diff --git a/picamera2/outputs/fileoutput.py b/picamera2/outputs/fileoutput.py
@@ -72,7 +72,7 @@ def connectiondead(self, _callback):
         else:
             raise RuntimeError("Must pass callback function or None")
 
-    def outputframe(self, frame, keyframe=True, timestamp=None, packet=None):
+    def outputframe(self, frame, keyframe=True, timestamp=None, packet=None, audio=False):
         """Outputs frame from encoder
 
         :param frame: Frame
@@ -82,6 +82,8 @@ def outputframe(self, frame, keyframe=True, timestamp=None, packet=None):
         :param timestamp: Timestamp of frame
         :type timestamp: int
         """
+        if audio:
+            raise RuntimeError("Fileoutput does not support audio")
         if self._fileoutput is not None and self.recording:
             if self._firstframe:
                 if not keyframe:
diff --git a/picamera2/outputs/output.py b/picamera2/outputs/output.py
@@ -22,7 +22,7 @@ def stop(self):
         """Stop recording"""
         self.recording = False
 
-    def outputframe(self, frame, keyframe=True, timestamp=None, packet=None):
+    def outputframe(self, frame, keyframe=True, timestamp=None, packet=None, audio=False):
         """Outputs frame from encoder
 
         :param frame: Frame
diff --git a/picamera2/outputs/pyavoutput.py b/picamera2/outputs/pyavoutput.py
diff --git a/picamera2/picamera2.py b/picamera2/picamera2.py
diff --git a/tests/check_timestamps.py b/tests/check_timestamps.py