Skip to content

Commit a9e3b31

Browse files
authored
Add support for Audio Events (#84)
1 parent 9f741eb commit a9e3b31

File tree

6 files changed

+77
-12
lines changed

6 files changed

+77
-12
lines changed

CHANGELOG.md

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,12 @@ All notable changes to this project will be documented in this file.
55
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
66
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
77

8+
## [1.14.0] - 2024-02-12
9+
10+
### Added
11+
12+
- Support for the Audio Events feature
13+
814
## [1.13.1] - 2023-12-21
915

1016
### Changed

VERSION

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
1.13.1
1+
1.14.0

speechmatics/cli.py

Lines changed: 21 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
import sys
1212
from dataclasses import dataclass
1313
from socket import gaierror
14-
from typing import List
14+
from typing import Any, Dict, List
1515

1616
import httpx
1717
import toml
@@ -26,6 +26,7 @@
2626
from speechmatics.exceptions import JobNotFoundException, TranscriptionError
2727
from speechmatics.helpers import _process_status_errors
2828
from speechmatics.models import (
29+
AudioEventsConfig,
2930
AudioSettings,
3031
AutoChaptersConfig,
3132
BatchLanguageIdentificationConfig,
@@ -198,7 +199,7 @@ def get_transcription_config(
198199
config = json.load(config_file)
199200
else:
200201
# Ensure "en" is the default language as to not break existing API behavior.
201-
config = {"language": "en"}
202+
config: Dict[str, Any] = {"language": "en"}
202203

203204
# transcription_config is flattened in the BatchTranscriptionConfig,
204205
# so the config entry from JSON must be flattened here, otherwise the JSON entry would be ignored
@@ -341,6 +342,14 @@ def get_transcription_config(
341342
if args_auto_chapters or auto_chapters_config is not None:
342343
config["auto_chapters_config"] = AutoChaptersConfig()
343344

345+
# Turn audio events on when --audio-events was passed or the JSON config has
# an "audio_events_config" entry, mirroring the auto-chapters check above.
# --audio-events is a store_true flag, so when it is defined but not passed
# its value is False (never None); args.get() only yields None when the key
# is absent. Test the flag's truthiness rather than `is not None`, which
# switched audio events on for every rt run.
audio_events_config = config.get("audio_events_config")
arg_audio_events = args.get("audio_events")
if arg_audio_events or audio_events_config is not None:
    # A missing or empty "types" entry is passed on as None.
    types = (audio_events_config or {}).get("types") or None
    config["audio_events_config"] = AudioEventsConfig(types)
352+
344353
if args["mode"] == "rt":
345354
# pylint: disable=unexpected-keyword-arg
346355
return TranscriptionConfig(**config)
@@ -448,6 +457,14 @@ def transcript_handler(message):
448457
sys.stdout.write(f"{escape_seq}{plaintext}\n")
449458
transcripts.text += plaintext
450459

460+
def audio_event_handler(message):
    """Report an audio event message on stdout.

    When ``print_json`` is set the raw message is dumped as JSON and nothing
    else happens. Otherwise the upper-cased event type is written in square
    brackets (e.g. ``[MUSIC]``) and appended to the running transcript text.

    ``print_json``, ``escape_seq`` and ``transcripts`` are not defined in
    this function; presumably they come from the enclosing scope.

    :param message: Message dict carrying an ``"event"`` mapping; a missing
        ``"type"`` entry is rendered as an empty label.
    """
    if not print_json:
        event_kind = message["event"].get("type", "").upper()
        sys.stdout.write(f"{escape_seq}[{event_kind}]\n")
        transcripts.text += f"[{event_kind}] "
        return
    print(json.dumps(message))
467+
451468
def partial_translation_handler(message):
452469
if print_json:
453470
print(json.dumps(message))
@@ -480,6 +497,8 @@ def end_of_transcript_handler(_):
480497
# print both transcription and translation messages (if json was requested)
481498
# print translation (if text was requested then)
482499
# print transcription (if text was requested without translation)
500+
501+
api.add_event_handler(ServerMessageType.AudioEventStarted, audio_event_handler)
483502
if print_json:
484503
if enable_partials or enable_translation_partials:
485504
api.add_event_handler(

speechmatics/cli_parser.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -467,6 +467,12 @@ def get_arg_parser():
467467
help="Which type of diarization to use.",
468468
)
469469

470+
rt_transcribe_command_parser.add_argument(
471+
"--audio-events",
472+
action="store_true",
473+
help="Enable audio event detection and print events in square-brackets to the console, e.g. [MUSIC]",
474+
)
475+
470476
# Build our actual parsers.
471477
mode_subparsers = parser.add_subparsers(title="Mode", dest="mode")
472478

speechmatics/client.py

Lines changed: 18 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -72,7 +72,6 @@ def __init__(
7272
self.connection_settings.set_missing_values_from_config(UsageMode.RealTime)
7373
self.websocket = None
7474
self.transcription_config = None
75-
self.translation_config = None
7675

7776
self.event_handlers = {x: [] for x in ServerMessageType}
7877
self.middlewares = {x: [] for x in ClientMessageType}
@@ -135,12 +134,19 @@ def _set_recognition_config(self):
135134
:py:attr:`speechmatics.models.ClientMessageType.SetRecognitionConfig`
136135
message.
137136
"""
137+
assert self.transcription_config is not None
138138
msg = {
139139
"message": ClientMessageType.SetRecognitionConfig,
140140
"transcription_config": self.transcription_config.as_config(),
141141
}
142-
if self.translation_config is not None:
143-
msg["translation_config"] = self.translation_config.asdict()
142+
if self.transcription_config.translation_config is not None:
143+
msg[
144+
"translation_config"
145+
] = self.transcription_config.translation_config.asdict()
146+
if self.transcription_config.audio_events_config is not None:
147+
msg[
148+
"audio_events_config"
149+
] = self.transcription_config.audio_events_config.asdict()
144150
self._call_middleware(ClientMessageType.SetRecognitionConfig, msg, False)
145151
return msg
146152

@@ -155,13 +161,20 @@ def _start_recognition(self, audio_settings):
155161
:param audio_settings: Audio settings to use.
156162
:type audio_settings: speechmatics.models.AudioSettings
157163
"""
164+
assert self.transcription_config is not None
158165
msg = {
159166
"message": ClientMessageType.StartRecognition,
160167
"audio_format": audio_settings.asdict(),
161168
"transcription_config": self.transcription_config.as_config(),
162169
}
163-
if self.translation_config is not None:
164-
msg["translation_config"] = self.translation_config.asdict()
170+
if self.transcription_config.translation_config is not None:
171+
msg[
172+
"translation_config"
173+
] = self.transcription_config.translation_config.asdict()
174+
if self.transcription_config.audio_events_config is not None:
175+
msg[
176+
"audio_events_config"
177+
] = self.transcription_config.audio_events_config.asdict()
165178
self.session_running = True
166179
self._call_middleware(ClientMessageType.StartRecognition, msg, False)
167180
LOGGER.debug(msg)
@@ -435,7 +448,6 @@ async def run(
435448
consumer/producer tasks.
436449
"""
437450
self.transcription_config = transcription_config
438-
self.translation_config = transcription_config.translation_config
439451
self.seq_no = 0
440452
self._language_pack_info = None
441453
await self._init_synchronization_primitives()

speechmatics/models.py

Lines changed: 25 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -176,7 +176,7 @@ class BatchTranslationConfig(TranslationConfig):
176176
class BatchLanguageIdentificationConfig:
177177
"""Batch mode: Language identification config."""
178178

179-
expected_languages: List[str] = None
179+
expected_languages: Optional[List[str]] = None
180180
"""Expected languages for language identification"""
181181

182182

@@ -203,7 +203,7 @@ class SentimentAnalysisConfig:
203203
class TopicDetectionConfig:
204204
"""Defines topic detection parameters."""
205205

206-
topics: List[str] = None
206+
topics: Optional[List[str]] = None
207207
"""Optional list of topics for topic detection."""
208208

209209

@@ -212,6 +212,18 @@ class AutoChaptersConfig:
212212
"""Auto Chapters config."""
213213

214214

215+
@dataclass
class AudioEventsConfig:
    """Audio events config."""

    types: Optional[List[str]] = None
    """Optional list of audio event types to detect."""

    def asdict(self):
        """Return the config as a plain dict.

        ``types`` is reported as an empty list when it is unset. The
        instance itself is left untouched, so serialising the config does
        not change ``self.types``.

        :return: Dict with a single ``"types"`` key holding a list.
        """
        config = asdict(self)
        if config["types"] is None:
            config["types"] = []
        return config
225+
226+
215227
@dataclass(init=False)
216228
class TranscriptionConfig(_TranscriptionConfig):
217229
# pylint: disable=too-many-instance-attributes
@@ -254,12 +266,16 @@ class TranscriptionConfig(_TranscriptionConfig):
254266
"""Indicates if partial translation, where words are produced
255267
immediately, is enabled."""
256268

257-
translation_config: TranslationConfig = None
269+
translation_config: Optional[TranslationConfig] = None
258270
"""Optional configuration for translation."""
259271

272+
audio_events_config: Optional[AudioEventsConfig] = None
273+
"""Optional configuration for audio events"""
274+
260275
def as_config(self):
261276
dictionary = self.asdict()
262277
dictionary.pop("translation_config", None)
278+
dictionary.pop("audio_events_config", None)
263279
dictionary.pop("enable_translation_partials", None)
264280
enable_transcription_partials = dictionary.pop(
265281
"enable_transcription_partials", False
@@ -504,6 +520,12 @@ class ServerMessageType(str, Enum):
504520
AddTranscript = "AddTranscript"
505521
"""Indicates the final transcript of a part of the audio."""
506522

523+
AudioEventStarted = "AudioEventStarted"
524+
"""Indicates the start of an audio event."""
525+
526+
AudioEventEnded = "AudioEventEnded"
527+
"""Indicates the end of an audio event."""
528+
507529
AddPartialTranslation = "AddPartialTranslation"
508530
"""Indicates a partial translation, which is an incomplete translation that
509531
is immediately produced and may change as more context becomes available.

0 commit comments

Comments
 (0)