diff --git a/README.md b/README.md
index f4320c54..c10ba0c3 100644
--- a/README.md
+++ b/README.md
@@ -1,3 +1,47 @@
+### Change log [2025-11-30 12:16:49]
+1. Item Updated: `histogram_data_drift` (from version: `1.0.0` to `1.0.0`)
+2. Item Updated: `openai_proxy_app` (from version: `1.0.0` to `1.0.0`)
+3. Item Updated: `count_events` (from version: `1.0.0` to `1.0.0`)
+4. Item Updated: `evidently_iris` (from version: `1.0.0` to `1.0.0`)
+
+### Change log [2025-11-30 12:16:40]
+1. Item Updated: `test_classifier` (from version: `1.1.0` to `1.1.0`)
+2. Item Updated: `sklearn_classifier` (from version: `1.2.0` to `1.2.0`)
+3. Item Updated: `model_server_tester` (from version: `1.1.0` to `1.1.0`)
+4. Item Updated: `azureml_serving` (from version: `1.1.0` to `1.1.0`)
+5. Item Updated: `describe_dask` (from version: `1.2.0` to `1.2.0`)
+6. Item Updated: `batch_inference` (from version: `1.8.0` to `1.8.0`)
+7. Item Updated: `v2_model_server` (from version: `1.2.0` to `1.2.0`)
+8. Item Updated: `gen_class_data` (from version: `1.3.0` to `1.3.0`)
+9. Item Updated: `send_email` (from version: `1.2.0` to `1.2.0`)
+10. Item Updated: `tf2_serving` (from version: `1.1.0` to `1.1.0`)
+11. Item Updated: `aggregate` (from version: `1.4.0` to `1.4.0`)
+12. Item Updated: `open_archive` (from version: `1.2.0` to `1.2.0`)
+13. Item Updated: `describe` (from version: `1.4.0` to `1.4.0`)
+14. Item Updated: `v2_model_tester` (from version: `1.1.0` to `1.1.0`)
+15. Item Updated: `text_to_audio_generator` (from version: `1.3.0` to `1.3.0`)
+16. Item Updated: `pii_recognizer` (from version: `0.4.0` to `0.4.0`)
+17. Item Updated: `github_utils` (from version: `1.1.0` to `1.1.0`)
+18. Item Updated: `sklearn_classifier_dask` (from version: `1.1.1` to `1.1.1`)
+19. Item Updated: `azureml_utils` (from version: `1.4.0` to `1.4.0`)
+20. Item Updated: `question_answering` (from version: `0.5.0` to `0.5.0`)
+21. Item Updated: `structured_data_generator` (from version: `1.6.0` to `1.6.0`)
+22. Item Updated: `arc_to_parquet` (from version: `1.5.0` to `1.5.0`)
+23. Item Updated: `silero_vad` (from version: `1.4.0` to `1.4.0`)
+24. Item Updated: `load_dataset` (from version: `1.2.0` to `1.2.0`)
+25. Item Updated: `auto_trainer` (from version: `1.8.0` to `1.8.0`)
+26. Item Updated: `feature_selection` (from version: `1.6.0` to `1.6.0`)
+27. Item Updated: `translate` (from version: `0.3.0` to `0.3.0`)
+28. Item Updated: `describe_spark` (from version: `1.1.0` to `1.1.0`)
+29. Item Updated: `pyannote_audio` (from version: `1.3.0` to `1.3.0`)
+30. Item Updated: `onnx_utils` (from version: `1.3.0` to `1.3.0`)
+31. Item Updated: `batch_inference_v2` (from version: `2.6.0` to `2.6.0`)
+32. Item Updated: `transcribe` (from version: `1.2.0` to `1.2.0`)
+33. Item Updated: `model_server` (from version: `1.2.0` to `1.2.0`)
+34. Item Updated: `mlflow_utils` (from version: `1.1.0` to `1.1.0`)
+35. Item Updated: `noise_reduction` (from version: `1.1.0` to `1.1.0`)
+36. Item Updated: `hugging_face_serving` (from version: `1.1.0` to `1.1.0`)
+
 ### Change log [2025-11-26 11:49:13]
 1. Item Updated: `histogram_data_drift` (from version: `1.0.0` to `1.0.0`)
 2. Item Updated: `openai_proxy_app` (from version: `1.0.0` to `1.0.0`)
diff --git a/functions/development/noise_reduction/1.1.0/static/documentation.html b/functions/development/noise_reduction/1.1.0/static/documentation.html
index a772e518..c798c0e2 100644
--- a/functions/development/noise_reduction/1.1.0/static/documentation.html
+++ b/functions/development/noise_reduction/1.1.0/static/documentation.html
@@ -165,32 +165,7 @@

Contents

@@ -204,246 +179,8 @@

noise_reduction package

Submodules#

-
-

noise_reduction.noise_reduction module#

-
-
-class noise_reduction.noise_reduction.DFN(target_directory: Path, verbose: bool = True, silence_threshold: float | None = None, pad: bool = True, atten_lim_db: int | None = None, **kwargs)[source]#
-

Bases: ReduceNoiseBase

-
-
-clean_audio(data: torch.Tensor) torch.Tensor[source]#
-

Clean the audio from noise. Here you should implement the noise reduction algorithm.

-
-
Parameters:
-

data – The audio data to clean.

-
-
Returns:
-

The cleaned audio.

-
-
-
-
-
-load_audio(file: str) torch.Tensor[source]#
-

Load the audio from a file.

-
-
Parameters:
-

file – The file to load the audio from.

-
-
Returns:
-

A tuple of: the audio data and the sample rate.

-
-
-
-
-
-save_audio(audio: ndarray, target_path: Path)[source]#
-

Save the audio to a file.

-
-
Parameters:
-
    -
  • audio – The audio to save.

  • -
  • target_path – The target path to save the audio to.

  • -
-
-
-
-
-
-
-class noise_reduction.noise_reduction.ReduceNoise(target_directory: Path, verbose: bool = True, silence_threshold: float | None = None, sample_rate: int = 16000, duration: int | None = None, channel: int | None = None)[source]#
-

Bases: ReduceNoiseBase

-
-
-clean_audio(data: ndarray) ndarray[source]#
-

Clean the audio from noise. Here you should implement the noise reduction algorithm.

-
-
Parameters:
-

data – The audio data to clean.

-
-
Returns:
-

The cleaned audio.

-
-
-
-
-
-load_audio(file: str) ndarray[source]#
-

Load the audio from a file.

-
-
Parameters:
-

file – The file to load the audio from.

-
-
Returns:
-

A tuple of: the audio data and the sample rate.

-
-
-
-
-
-save_audio(audio: ndarray, target_path: Path)[source]#
-

Save the audio to a file.

-
-
Parameters:
-
    -
  • audio – The audio to save.

  • -
  • target_path – The target path to save the audio to.

  • -
-
-
-
-
-
-
-class noise_reduction.noise_reduction.ReduceNoiseBase(target_directory: Path, verbose: bool = True, silence_threshold: float | None = None)[source]#
-

Bases: object

-

Base class for noise reduction. This class is meant to be inherited by specific noise reduction algorithms. You must implement the following methods: clean_audio – the method to clean the audio, where the noise reduction algorithm is implemented; save_audio – the method to save the audio to a file; load_audio – the method to load the audio from a file.

-

After implementing the above methods, you can use the reduce_noise method to reduce noise from audio files.
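For illustration, a minimal subclass might look like the sketch below. It is not part of the package: the subclass name, the trivial thresholding "algorithm", the use of soundfile for I/O, and the hardcoded 16 kHz sample rate are all assumptions made for the example.

from pathlib import Path
from typing import Tuple

import numpy as np
import soundfile  # assumed I/O backend; any audio library would do

from noise_reduction.noise_reduction import ReduceNoiseBase


class ThresholdReduce(ReduceNoiseBase):  # hypothetical subclass
    """Implements the three required methods with a placeholder algorithm."""

    def clean_audio(self, data: np.ndarray) -> np.ndarray:
        # Placeholder "noise reduction": zero out low-amplitude samples.
        return np.where(np.abs(data) < 0.01, 0.0, data)

    def load_audio(self, file: str) -> Tuple[np.ndarray, int]:
        # soundfile.read returns (audio data, sample rate), as required.
        return soundfile.read(file)

    def save_audio(self, audio: np.ndarray, target_path: Path):
        soundfile.write(str(target_path), audio, samplerate=16000)


# reduce_noise returns (error_occurred, (file_name, target_path_or_error)).
reducer = ThresholdReduce(target_directory=Path("./cleaned"))
error, (name, result) = reducer.reduce_noise(Path("example.wav"))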

-
-
-abstract clean_audio(data) ndarray | torch.Tensor[source]#
-

Clean the audio from noise. Here you should implement the noise reduction algorithm.

-
-
Parameters:
-

data – The audio data to clean.

-
-
Returns:
-

The cleaned audio.

-
-
-
-
-
-abstract load_audio(file: str) Tuple[ndarray | torch.Tensor, int][source]#
-

Load the audio from a file.

-
-
Parameters:
-

file – The file to load the audio from.

-
-
Returns:
-

A tuple of: the audio data and the sample rate.

-
-
-
-
-
-reduce_noise(audio_file: Path) Tuple[bool, Tuple[str, str]][source]#
-

Reduce noise from the given audio file.

-
-
Parameters:
-

audio_file – The audio file to reduce noise from.

-
-
Returns:
-

A tuple of: a boolean indicating whether an error occurred, and a tuple of:

  • the audio file name

  • the target path in case of success / the error message in case of failure.

  • -
-
-

-
-
-
-
-
-remove_silence(audio: ndarray)[source]#
-

Remove silence sections from the audio.

-
-
Parameters:
-

audio – The audio to remove silence from.

-
-
Returns:
-

The audio without silence.

-
-
-
-
-
-abstract save_audio(audio: ndarray, target_path: Path)[source]#
-

Save the audio to a file.

-
-
Parameters:
-
    -
  • audio – The audio to save.

  • -
  • target_path – The target path to save the audio to.

  • -
-
-
-
-
-
-update_to_wav_suffix(audio_file: Path)[source]#
-
-
-
-
-noise_reduction.noise_reduction.reduce_noise(audio_source: str, target_directory: str, sample_rate: int = 16000, duration: int | None = None, channel: int | None = None, silence_threshold: float | None = None, use_multiprocessing: int = 0, verbose: bool = True)[source]#
-

Reduce noise from an audio file or a directory containing audio files. The audio files must be in .wav format. The cleaned audio files will be saved in the target_directory. For information about the noise reduction algorithm see: timsainb/noisereduce. Notice that the saved files are in .wav format, even if the original files are in another format.

-
-
Parameters:
  • audio_source – path to an audio file or a directory containing audio files
  • target_directory – path to the directory to save the cleaned audio files.
  • sample_rate – Number of samples in one second of the audio file. Pass None to keep the original sample rate.
  • duration – Duration of the audio file to clean, in seconds. Pass None to keep the original duration.
  • channel – Channel to clean. Pass the number of the channel to clean. To clean all channels pass None.
  • silence_threshold – The threshold to remove silence from the audio, in dB. If None, no silence removal is performed.
  • use_multiprocessing – Number of processes to use for cleaning the audio files. If 0, no multiprocessing is used.
  • verbose – Verbosity level. If True, display a progress bar.
-
-
-
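Based on the signature above, a typical call might look like the following sketch; the paths and the silence threshold are illustrative values, not defaults.

from noise_reduction.noise_reduction import reduce_noise

# Clean all .wav files under ./recordings into ./cleaned, resampled to
# 16 kHz, trimming silence quieter than -40 dB, with 4 worker processes.
reduce_noise(
    audio_source="./recordings",
    target_directory="./cleaned",
    sample_rate=16000,
    silence_threshold=-40.0,
    use_multiprocessing=4,
    verbose=True,
)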
-
-noise_reduction.noise_reduction.reduce_noise_dfn(audio_source: str, target_directory: str, pad: bool = True, atten_lim_db: int | None = None, silence_threshold: float | None = None, use_multiprocessing: int = 0, verbose: bool = True, **kwargs)[source]#
-

Reduce noise from audio files using DeepFilterNet. For more information about the noise reduction algorithm see: Rikorose/DeepFilterNet. Notice that the saved files are in .wav format, even if the original files are in another format.

-
-
Parameters:
  • audio_source – path to an audio file or a directory of audio files
  • target_directory – path to the target directory to save the cleaned audio files
  • pad – whether to pad the audio file with zeros before cleaning
  • atten_lim_db – maximum attenuation in dB
  • silence_threshold – the threshold to remove silence from the audio, in dB. If None, no silence removal is performed.
  • use_multiprocessing – Number of processes to use for cleaning the audio files. If 0, no multiprocessing is used.
  • verbose – verbosity level. If True, display a progress bar and logs.
  • kwargs – additional arguments to pass to torchaudio.load(). For more information see: https://pytorch.org/audio/stable/generated/torchaudio.load.html
-
-
-
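A corresponding sketch for the DeepFilterNet variant; the paths and the attenuation limit are illustrative values.

from noise_reduction.noise_reduction import reduce_noise_dfn

# DeepFilterNet-based cleaning. atten_lim_db caps the attenuation applied;
# any extra keyword arguments are forwarded to torchaudio.load().
reduce_noise_dfn(
    audio_source="./recordings",
    target_directory="./cleaned_dfn",
    pad=True,
    atten_lim_db=40,
    use_multiprocessing=0,  # 0 disables multiprocessing
)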
+
+

noise_reduction.noise_reduction module#

Module contents#

@@ -463,32 +200,7 @@

Submodules
diff --git a/functions/development/noise_reduction/latest/static/documentation.html b/functions/development/noise_reduction/latest/static/documentation.html
index a772e518..c798c0e2 100644
--- a/functions/development/noise_reduction/latest/static/documentation.html
+++ b/functions/development/noise_reduction/latest/static/documentation.html
@@ -165,32 +165,7 @@

Contents

@@ -204,246 +179,8 @@

noise_reduction package

Submodules#

-
-

noise_reduction.noise_reduction module#

-
-
-class noise_reduction.noise_reduction.DFN(target_directory: Path, verbose: bool = True, silence_threshold: float | None = None, pad: bool = True, atten_lim_db: int | None = None, **kwargs)[source]#
-

Bases: ReduceNoiseBase

-
-
-clean_audio(data: torch.Tensor) torch.Tensor[source]#
-

Clean the audio from noise. Here you should implement the noise reduction algorithm.

-
-
Parameters:
-

data – The audio data to clean.

-
-
Returns:
-

The cleaned audio.

-
-
-
-
-
-load_audio(file: str) torch.Tensor[source]#
-

Load the audio from a file.

-
-
Parameters:
-

file – The file to load the audio from.

-
-
Returns:
-

A tuple of: the audio data and the sample rate.

-
-
-
-
-
-save_audio(audio: ndarray, target_path: Path)[source]#
-

Save the audio to a file.

-
-
Parameters:
-
    -
  • audio – The audio to save.

  • -
  • target_path – The target path to save the audio to.

  • -
-
-
-
-
-
-
-class noise_reduction.noise_reduction.ReduceNoise(target_directory: Path, verbose: bool = True, silence_threshold: float | None = None, sample_rate: int = 16000, duration: int | None = None, channel: int | None = None)[source]#
-

Bases: ReduceNoiseBase

-
-
-clean_audio(data: ndarray) ndarray[source]#
-

Clean the audio from noise. Here you should implement the noise reduction algorithm.

-
-
Parameters:
-

data – The audio data to clean.

-
-
Returns:
-

The cleaned audio.

-
-
-
-
-
-load_audio(file: str) ndarray[source]#
-

Load the audio from a file.

-
-
Parameters:
-

file – The file to load the audio from.

-
-
Returns:
-

A tuple of: the audio data and the sample rate.

-
-
-
-
-
-save_audio(audio: ndarray, target_path: Path)[source]#
-

Save the audio to a file.

-
-
Parameters:
-
    -
  • audio – The audio to save.

  • -
  • target_path – The target path to save the audio to.

  • -
-
-
-
-
-
-
-class noise_reduction.noise_reduction.ReduceNoiseBase(target_directory: Path, verbose: bool = True, silence_threshold: float | None = None)[source]#
-

Bases: object

-

Base class for noise reduction. This class is meant to be inherited by specific noise reduction algorithms. You must implement the following methods: clean_audio – the method to clean the audio, where the noise reduction algorithm is implemented; save_audio – the method to save the audio to a file; load_audio – the method to load the audio from a file.

-

After implementing the above methods, you can use the reduce_noise method to reduce noise from audio files.

-
-
-abstract clean_audio(data) ndarray | torch.Tensor[source]#
-

Clean the audio from noise. Here you should implement the noise reduction algorithm.

-
-
Parameters:
-

data – The audio data to clean.

-
-
Returns:
-

The cleaned audio.

-
-
-
-
-
-abstract load_audio(file: str) Tuple[ndarray | torch.Tensor, int][source]#
-

Load the audio from a file.

-
-
Parameters:
-

file – The file to load the audio from.

-
-
Returns:
-

A tuple of: the audio data and the sample rate.

-
-
-
-
-
-reduce_noise(audio_file: Path) Tuple[bool, Tuple[str, str]][source]#
-

Reduce noise from the given audio file.

-
-
Parameters:
-

audio_file – The audio file to reduce noise from.

-
-
Returns:
-

A tuple of: a boolean indicating whether an error occurred, and a tuple of:

  • the audio file name

  • the target path in case of success / the error message in case of failure.

  • -
-
-

-
-
-
-
-
-remove_silence(audio: ndarray)[source]#
-

Remove silence sections from the audio.

-
-
Parameters:
-

audio – The audio to remove silence from.

-
-
Returns:
-

The audio without silence.

-
-
-
-
-
-abstract save_audio(audio: ndarray, target_path: Path)[source]#
-

Save the audio to a file.

-
-
Parameters:
-
    -
  • audio – The audio to save.

  • -
  • target_path – The target path to save the audio to.

  • -
-
-
-
-
-
-update_to_wav_suffix(audio_file: Path)[source]#
-
-
-
-
-noise_reduction.noise_reduction.reduce_noise(audio_source: str, target_directory: str, sample_rate: int = 16000, duration: int | None = None, channel: int | None = None, silence_threshold: float | None = None, use_multiprocessing: int = 0, verbose: bool = True)[source]#
-

Reduce noise from an audio file or a directory containing audio files. The audio files must be in .wav format. The cleaned audio files will be saved in the target_directory. For information about the noise reduction algorithm see: timsainb/noisereduce. Notice that the saved files are in .wav format, even if the original files are in another format.

-
-
Parameters:
  • audio_source – path to an audio file or a directory containing audio files
  • target_directory – path to the directory to save the cleaned audio files.
  • sample_rate – Number of samples in one second of the audio file. Pass None to keep the original sample rate.
  • duration – Duration of the audio file to clean, in seconds. Pass None to keep the original duration.
  • channel – Channel to clean. Pass the number of the channel to clean. To clean all channels pass None.
  • silence_threshold – The threshold to remove silence from the audio, in dB. If None, no silence removal is performed.
  • use_multiprocessing – Number of processes to use for cleaning the audio files. If 0, no multiprocessing is used.
  • verbose – Verbosity level. If True, display a progress bar.
-
-
-
-
-noise_reduction.noise_reduction.reduce_noise_dfn(audio_source: str, target_directory: str, pad: bool = True, atten_lim_db: int | None = None, silence_threshold: float | None = None, use_multiprocessing: int = 0, verbose: bool = True, **kwargs)[source]#
-

Reduce noise from audio files using DeepFilterNet. For more information about the noise reduction algorithm see: Rikorose/DeepFilterNet. Notice that the saved files are in .wav format, even if the original files are in another format.

-
-
Parameters:
  • audio_source – path to an audio file or a directory of audio files
  • target_directory – path to the target directory to save the cleaned audio files
  • pad – whether to pad the audio file with zeros before cleaning
  • atten_lim_db – maximum attenuation in dB
  • silence_threshold – the threshold to remove silence from the audio, in dB. If None, no silence removal is performed.
  • use_multiprocessing – Number of processes to use for cleaning the audio files. If 0, no multiprocessing is used.
  • verbose – verbosity level. If True, display a progress bar and logs.
  • kwargs – additional arguments to pass to torchaudio.load(). For more information see: https://pytorch.org/audio/stable/generated/torchaudio.load.html
-
-
-
+
+

noise_reduction.noise_reduction module#

Module contents#

@@ -463,32 +200,7 @@

Submodules
diff --git a/functions/development/pyannote_audio/1.3.0/static/documentation.html b/functions/development/pyannote_audio/1.3.0/static/documentation.html
index e28a243b..c3132429 100644
--- a/functions/development/pyannote_audio/1.3.0/static/documentation.html
+++ b/functions/development/pyannote_audio/1.3.0/static/documentation.html
@@ -165,11 +165,7 @@

Contents

@@ -183,75 +179,8 @@

pyannote_audio package

Submodules#

-
-

pyannote_audio.pyannote_audio module#

-
-
-pyannote_audio.pyannote_audio.diarize(data_path: str | List[str], model_name: str = 'pyannote/speaker-diarization-3.0', access_token: str | None = None, device: str | None = None, speakers_labels: List[str] | None = None, speaker_prefix: str = 'speaker_', separate_by_channels: bool = False, minimum_speakers: int | None = None, maximum_speakers: int | None = None, verbose: bool = False) Tuple[Dict[str, List[Tuple[float, float, str]]], Dict[str, str]][source]#
-

Perform speech diarization on given audio files using pyannote-audio (pyannote/pyannote-audio). The end result is a dictionary with the file names as keys and their diarizations as values. A diarization is a list of tuples: (start, end, speaker_label).

-

To use the pyannote.audio models you must pass a Huggingface token and get access to the required models. The token can be passed in one of the following ways:

  • Use the parameter access_token.
  • Set an environment variable named “HUGGING_FACE_HUB_TOKEN”.
  • If using MLRun, you can pass it as a secret named “HUGGING_FACE_HUB_TOKEN”.
-

To get access to the models on Huggingface, visit their page. For example, to use the default diarization model set in this function (“pyannote/speaker-diarization-3.0”), you need access to these two models:


Note: To control the recognized speakers in the diarization output you can choose one of the following methods:

  • For a known amount of speakers, you may set speaker labels via the speakers_labels parameter; they will be used in the order of speaking in the audio (the first person speaking gets the first label in the list). In addition, you can do diarization per channel (by setting the parameter separate_by_channels to True). Each label will be assigned to a specific channel by order (first label to channel 0, second label to channel 1, and so on). Notice, this will increase runtime.
  • For an unknown amount of speakers, you can set the speaker_prefix parameter to add a prefix to each speaker number. You can also help the diarization by setting the expected speakers range via the minimum_speakers and maximum_speakers parameters.
-
Parameters:
  • data_path – A directory of the audio files, a single file, or a list of files to transcribe.
  • model_name – One of the official diarization model names (referred to as diarization pipelines) on the pyannote.audio Huggingface page. Default: “pyannote/speaker-diarization-3.0”.
  • access_token – An access token to pass for using the pyannote.audio models. If not provided, the environment variable “HUGGING_FACE_HUB_TOKEN” will be used. If MLRun is available, it will look for a secret “HUGGING_FACE_HUB_TOKEN”.
  • device – Device to load the model. Can be one of {“cuda”, “cpu”}. Default will prefer “cuda” if available.
  • speakers_labels – Labels to use for the recognized speakers. Default: numeric labels (0, 1, …).
  • separate_by_channels – If each speaker is speaking in a separate channel, you can diarize each channel and combine the results into a single diarization. Each label set in the speakers_labels parameter will be assigned to a specific channel by order.
  • speaker_prefix – A prefix to add to the speakers’ labels. This parameter is ignored if speakers_labels is not None. Default: “speaker_”.
  • minimum_speakers – Set the minimum expected amount of speakers to be in the audio files. This parameter is ignored if speakers_labels is not None.
  • maximum_speakers – Set the maximum expected amount of speakers to be in the audio files. This parameter is ignored if speakers_labels is not None.
  • verbose – Whether to present logs of a progress bar and errors. Default: True.
-
-
Returns:
A tuple of:
  • Speech diarization dictionary.
  • A dictionary of errored files that were not transcribed.

-
-
-
-
-
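Putting the above together, a call could look like the sketch below. The directory and the two speaker labels are illustrative; the token is read from the environment variable mentioned above, and the model name is the documented default.

import os

from pyannote_audio.pyannote_audio import diarize

diarizations, errors = diarize(
    data_path="./interviews",             # directory, single file, or list of files
    model_name="pyannote/speaker-diarization-3.0",
    access_token=os.environ.get("HUGGING_FACE_HUB_TOKEN"),
    speakers_labels=["agent", "client"],  # known speakers, in order of speaking
    verbose=True,
)

# Each diarization is a list of (start, end, speaker_label) tuples.
for file_name, segments in diarizations.items():
    for start, end, speaker in segments:
        print(f"{file_name}: {speaker} from {start:.1f}s to {end:.1f}s")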
-pyannote_audio.pyannote_audio.open_mpi_handler(worker_inputs: List[str], root_worker_inputs: Dict[str, Any] | None = None)[source]#
-
+
+

pyannote_audio.pyannote_audio module#

Module contents#

@@ -271,11 +200,7 @@

Submodules
diff --git a/functions/development/pyannote_audio/latest/static/documentation.html b/functions/development/pyannote_audio/latest/static/documentation.html
index e28a243b..c3132429 100644
--- a/functions/development/pyannote_audio/latest/static/documentation.html
+++ b/functions/development/pyannote_audio/latest/static/documentation.html
@@ -165,11 +165,7 @@

Contents

@@ -183,75 +179,8 @@

pyannote_audio package

Submodules#

-
-

pyannote_audio.pyannote_audio module#

-
-
-pyannote_audio.pyannote_audio.diarize(data_path: str | List[str], model_name: str = 'pyannote/speaker-diarization-3.0', access_token: str | None = None, device: str | None = None, speakers_labels: List[str] | None = None, speaker_prefix: str = 'speaker_', separate_by_channels: bool = False, minimum_speakers: int | None = None, maximum_speakers: int | None = None, verbose: bool = False) Tuple[Dict[str, List[Tuple[float, float, str]]], Dict[str, str]][source]#
-

Perform speech diarization on given audio files using pyannote-audio (pyannote/pyannote-audio). The end result is a dictionary with the file names as keys and their diarizations as values. A diarization is a list of tuples: (start, end, speaker_label).

-

To use the pyannote.audio models you must pass a Huggingface token and get access to the required models. The token can be passed in one of the following ways:

  • Use the parameter access_token.
  • Set an environment variable named “HUGGING_FACE_HUB_TOKEN”.
  • If using MLRun, you can pass it as a secret named “HUGGING_FACE_HUB_TOKEN”.
-

To get access to the models on Huggingface, visit their page. For example, to use the default diarization model set in this function (“pyannote/speaker-diarization-3.0”), you need access to these two models:


Note: To control the recognized speakers in the diarization output you can choose one of the following methods:

  • For a known amount of speakers, you may set speaker labels via the speakers_labels parameter; they will be used in the order of speaking in the audio (the first person speaking gets the first label in the list). In addition, you can do diarization per channel (by setting the parameter separate_by_channels to True). Each label will be assigned to a specific channel by order (first label to channel 0, second label to channel 1, and so on). Notice, this will increase runtime.
  • For an unknown amount of speakers, you can set the speaker_prefix parameter to add a prefix to each speaker number. You can also help the diarization by setting the expected speakers range via the minimum_speakers and maximum_speakers parameters.
-
Parameters:
  • data_path – A directory of the audio files, a single file, or a list of files to transcribe.
  • model_name – One of the official diarization model names (referred to as diarization pipelines) on the pyannote.audio Huggingface page. Default: “pyannote/speaker-diarization-3.0”.
  • access_token – An access token to pass for using the pyannote.audio models. If not provided, the environment variable “HUGGING_FACE_HUB_TOKEN” will be used. If MLRun is available, it will look for a secret “HUGGING_FACE_HUB_TOKEN”.
  • device – Device to load the model. Can be one of {“cuda”, “cpu”}. Default will prefer “cuda” if available.
  • speakers_labels – Labels to use for the recognized speakers. Default: numeric labels (0, 1, …).
  • separate_by_channels – If each speaker is speaking in a separate channel, you can diarize each channel and combine the results into a single diarization. Each label set in the speakers_labels parameter will be assigned to a specific channel by order.
  • speaker_prefix – A prefix to add to the speakers’ labels. This parameter is ignored if speakers_labels is not None. Default: “speaker_”.
  • minimum_speakers – Set the minimum expected amount of speakers to be in the audio files. This parameter is ignored if speakers_labels is not None.
  • maximum_speakers – Set the maximum expected amount of speakers to be in the audio files. This parameter is ignored if speakers_labels is not None.
  • verbose – Whether to present logs of a progress bar and errors. Default: True.
-
-
Returns:
A tuple of:
  • Speech diarization dictionary.
  • A dictionary of errored files that were not transcribed.

-
-
-
-
-
-pyannote_audio.pyannote_audio.open_mpi_handler(worker_inputs: List[str], root_worker_inputs: Dict[str, Any] | None = None)[source]#
-
+
+

pyannote_audio.pyannote_audio module#

Module contents#

@@ -271,11 +200,7 @@

Submodules
diff --git a/functions/development/question_answering/0.5.0/static/documentation.html b/functions/development/question_answering/0.5.0/static/documentation.html
index 602f654b..886e04cf 100644
--- a/functions/development/question_answering/0.5.0/static/documentation.html
+++ b/functions/development/question_answering/0.5.0/static/documentation.html
@@ -165,38 +165,7 @@

Contents

@@ -210,163 +179,8 @@

question_answering package

Submodules#

-
-

question_answering.question_answering module#

-
-
-class question_answering.question_answering.PollQuestionHandler(poll_count: int = 5, poll_strategy: str = 'most_common')[source]#
-

Bases: QuestionHandler

-

A class for handling question answering for poll-type questions. These types of questions are answered by asking the same question multiple times and choosing the most common answer or the average answer.

-
-
-class ConfigKeys[source]#
-

Bases: object

-

Static class to hold all the possible poll question configuration option keys.

-
-
-POLL_COUNT = 'poll_count'#
-

The number of times to ask the same question.

-
-
-
-POLL_STRATEGY = 'poll_strategy'#
-

The strategy to use for choosing the answer from the poll.

-
-
-
-
-class Strategy(value)[source]#
-

Bases: Enum

-

An enumeration.

-
-
-AVERAGE = 'average'#
-

The average answer strategy.

-
-
-
-MOST_COMMON = 'most_common'#
-

The most common answer strategy.

-
-
-
-static average(answers)[source]#
-

Calculate the average answer for a given list of answers.

-
-
-
-do(answers)[source]#
-

Perform the strategy.

-
-
-
-static most_common(answers)[source]#
-

Calculate the most common answer for a given list of answers.

-
-
-
-
-answer(questions_amount: int, batched_input: List[str], generation_pipeline: transformers.Pipeline, generation_config: transformers.GenerationConfig) List[List[str]][source]#
-

Answer questions with context about the given text files' contents, using a pretrained LLM model in the given pipeline.

-
-
-
-
-class question_answering.question_answering.QuestionHandler[source]#
-

Bases: object

-

A class for handling question answering for a given question type. This class is used as a base class for all question types, and as the default question type (regular question answering without any special handling).

-
-
-class ConfigKeys[source]#
-

Bases: object

-
-
-
-answer(questions_amount: int, batched_input: List[str], generation_pipeline: transformers.Pipeline, generation_config: transformers.GenerationConfig) List[List[str]][source]#
-

Answer questions with context about the given text files' contents, using a pretrained LLM model in the given pipeline.

-
-
-
-
-class question_answering.question_answering.QuestionTypes[source]#
-

Bases: object

-
-
-DEFAULT = 'default'#
-
-
-
-POLL = 'poll'#
-
-
-
-
-question_answering.question_answering.answer_questions(data_path: str | List[str], model_name: str, questions: List[str] | List[List[str]], device_map: str | dict | None = None, model_kwargs: dict | None = None, auto_gptq_exllama_max_input_length: int | None = None, tokenizer_name: str | None = None, tokenizer_kwargs: dict | None = None, text_wrapper: str | List[str] = '', questions_wrapper: str | List[str] = '', generation_config: Dict | List[Dict] | None = None, questions_config: Dict | List[Dict] | None = None, batch_size: int = 1, questions_columns: List[str] | None = None, verbose: bool = False) Tuple[DataFrame, dict][source]#
-

Answer questions with context about the given text files' contents, using a pretrained LLM model. Each text file will have the following prompt built:

-

start of text_wrapper
<text file content>
end of text_wrapper

-

start of questions_wrapper
1. <questions[0]>
2. <questions[1]>
…
n. <questions[n-1]>
end of questions_wrapper

-
-
Parameters:
  • data_path – A path to a directory of text files or a path to a text file to ask questions about.
  • model_name – The pre-trained model name from the huggingface hub to use for asking questions.
  • questions – The questions to ask. A list of lists of questions to ask per text file, divided by question groups; the groups can be determined by size (in order to avoid large inputs to the llm) or by questioning method (regular or poll-like questioning).
  • device_map – A map to use for loading the model on multiple devices.
  • model_kwargs – Keyword arguments to pass for loading the model using HuggingFace’s transformers.AutoModelForCausalLM.from_pretrained function.
  • auto_gptq_exllama_max_input_length – For AutoGPTQ models, to set and extend the model’s input buffer size.
  • tokenizer_name – The tokenizer name from the huggingface hub to use. If not given, the model name will be used.
  • tokenizer_kwargs – Keyword arguments to pass for loading the tokenizer using HuggingFace’s transformers.AutoTokenizer.from_pretrained function.
  • text_wrapper – A wrapper for the file’s text. Will be added at the start of the prompt. Must have a placeholder (‘{}’) for the text of the file.
  • questions_wrapper – A wrapper for the questions received. Will be added after the text wrapper in the prompt template. Must have a placeholder (‘{}’) for the questions.
  • generation_config – HuggingFace’s GenerationConfig keyword arguments to pass to the generate method.
  • questions_config – A dictionary or list of dictionaries containing specific ways to answer questions (using a poll, for example); each dictionary in the list is for the corresponding question group and determines the question-asking method for said group.
  • batch_size – Batch size for inference.
  • questions_columns – Columns to use for the dataframe returned.
  • verbose – Whether to present logs of a progress bar and errors. Default: True.
-
-
Returns:
A tuple of:
  • A dataframe dataset of the questions’ answers.
  • A dictionary of errored files that were not inferred or were not answered properly.

-
-
-
-
-
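As a usage sketch of the signature above: the model name, questions, wrappers, and column names are illustrative, and both wrappers carry the required ‘{}’ placeholder.

from question_answering.question_answering import answer_questions

answers_df, errors = answer_questions(
    data_path="./articles",
    model_name="mistralai/Mistral-7B-Instruct-v0.2",  # any causal LM from the hub
    questions=["What is the main topic?", "Who is the author?"],
    text_wrapper="Given the following text:\n{}",
    questions_wrapper="Answer the following questions:\n{}",
    questions_columns=["topic", "author"],
    batch_size=1,
    verbose=True,
)
print(answers_df.head())  # one row per file, one column per question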
-question_answering.question_answering.open_mpi_handler(worker_inputs: List[str], root_worker_inputs: Dict[str, Any] | None = None)[source]#
-
+
+

question_answering.question_answering module#

Module contents#

@@ -386,38 +200,7 @@

Submodules
diff --git a/functions/development/question_answering/latest/static/documentation.html b/functions/development/question_answering/latest/static/documentation.html
index 602f654b..886e04cf 100644
--- a/functions/development/question_answering/latest/static/documentation.html
+++ b/functions/development/question_answering/latest/static/documentation.html
@@ -165,38 +165,7 @@

Contents

@@ -210,163 +179,8 @@

question_answering package

Submodules#

-
-

question_answering.question_answering module#

-
-
-class question_answering.question_answering.PollQuestionHandler(poll_count: int = 5, poll_strategy: str = 'most_common')[source]#
-

Bases: QuestionHandler

-

A class for handling question answering for poll-type questions. These types of questions are answered by asking the same question multiple times and choosing the most common answer or the average answer.

-
-
-class ConfigKeys[source]#
-

Bases: object

-

Static class to hold all the possible poll question configuration option keys.

-
-
-POLL_COUNT = 'poll_count'#
-

The number of times to ask the same question.

-
-
-
-POLL_STRATEGY = 'poll_strategy'#
-

The strategy to use for choosing the answer from the poll.

-
-
-
-
-class Strategy(value)[source]#
-

Bases: Enum

-

An enumeration.

-
-
-AVERAGE = 'average'#
-

The average answer strategy.

-
-
-
-MOST_COMMON = 'most_common'#
-

The most common answer strategy.

-
-
-
-static average(answers)[source]#
-

Calculate the average answer for a given list of answers.

-
-
-
-do(answers)[source]#
-

Perform the strategy.

-
-
-
-static most_common(answers)[source]#
-

Calculate the most common answer for a given list of answers.

-
-
-
-
-answer(questions_amount: int, batched_input: List[str], generation_pipeline: transformers.Pipeline, generation_config: transformers.GenerationConfig) List[List[str]][source]#
-

Answer questions with context about the given text files' contents, using a pretrained LLM model in the given pipeline.

-
-
-
-
-class question_answering.question_answering.QuestionHandler[source]#
-

Bases: object

-

A class for handling question answering for a given question type. This class is used as a base class for all question types, and as the default question type (regular question answering without any special handling).

-
-
-class ConfigKeys[source]#
-

Bases: object

-
-
-
-answer(questions_amount: int, batched_input: List[str], generation_pipeline: transformers.Pipeline, generation_config: transformers.GenerationConfig) List[List[str]][source]#
-

Answer questions with context about the given text files' contents, using a pretrained LLM model in the given pipeline.

-
-
-
-
-class question_answering.question_answering.QuestionTypes[source]#
-

Bases: object

-
-
-DEFAULT = 'default'#
-
-
-
-POLL = 'poll'#
-
-
-
-
-question_answering.question_answering.answer_questions(data_path: str | List[str], model_name: str, questions: List[str] | List[List[str]], device_map: str | dict | None = None, model_kwargs: dict | None = None, auto_gptq_exllama_max_input_length: int | None = None, tokenizer_name: str | None = None, tokenizer_kwargs: dict | None = None, text_wrapper: str | List[str] = '', questions_wrapper: str | List[str] = '', generation_config: Dict | List[Dict] | None = None, questions_config: Dict | List[Dict] | None = None, batch_size: int = 1, questions_columns: List[str] | None = None, verbose: bool = False) Tuple[DataFrame, dict][source]#
-

Answer questions with context about the given text files' contents, using a pretrained LLM model. Each text file will have the following prompt built:

-

start of text_wrapper
<text file content>
end of text_wrapper

-

start of questions_wrapper
1. <questions[0]>
2. <questions[1]>
…
n. <questions[n-1]>
end of questions_wrapper

-
-
Parameters:
  • data_path – A path to a directory of text files or a path to a text file to ask questions about.
  • model_name – The pre-trained model name from the huggingface hub to use for asking questions.
  • questions – The questions to ask. A list of lists of questions to ask per text file, divided by question groups; the groups can be determined by size (in order to avoid large inputs to the llm) or by questioning method (regular or poll-like questioning).
  • device_map – A map to use for loading the model on multiple devices.
  • model_kwargs – Keyword arguments to pass for loading the model using HuggingFace’s transformers.AutoModelForCausalLM.from_pretrained function.
  • auto_gptq_exllama_max_input_length – For AutoGPTQ models, to set and extend the model’s input buffer size.
  • tokenizer_name – The tokenizer name from the huggingface hub to use. If not given, the model name will be used.
  • tokenizer_kwargs – Keyword arguments to pass for loading the tokenizer using HuggingFace’s transformers.AutoTokenizer.from_pretrained function.
  • text_wrapper – A wrapper for the file’s text. Will be added at the start of the prompt. Must have a placeholder (‘{}’) for the text of the file.
  • questions_wrapper – A wrapper for the questions received. Will be added after the text wrapper in the prompt template. Must have a placeholder (‘{}’) for the questions.
  • generation_config – HuggingFace’s GenerationConfig keyword arguments to pass to the generate method.
  • questions_config – A dictionary or list of dictionaries containing specific ways to answer questions (using a poll, for example); each dictionary in the list is for the corresponding question group and determines the question-asking method for said group.
  • batch_size – Batch size for inference.
  • questions_columns – Columns to use for the dataframe returned.
  • verbose – Whether to present logs of a progress bar and errors. Default: True.
-
-
Returns:
A tuple of:
  • A dataframe dataset of the questions’ answers.
  • A dictionary of errored files that were not inferred or were not answered properly.

-
-
-
-
-
-question_answering.question_answering.open_mpi_handler(worker_inputs: List[str], root_worker_inputs: Dict[str, Any] | None = None)[source]#
-
+
+

question_answering.question_answering module#

Module contents#

@@ -386,38 +200,7 @@

Submodules
diff --git a/functions/development/silero_vad/1.4.0/static/documentation.html b/functions/development/silero_vad/1.4.0/static/documentation.html
index 344c2421..17866937 100644
--- a/functions/development/silero_vad/1.4.0/static/documentation.html
+++ b/functions/development/silero_vad/1.4.0/static/documentation.html
@@ -165,33 +165,7 @@

Contents

@@ -205,267 +179,8 @@

silero_vad package

Submodules#

-
-

silero_vad.silero_vad module#

-
-
-class silero_vad.silero_vad.BaseTask(audio_file: Path)[source]#
-

Bases: object

-

A base class for a task to complete after VAD.

-
-
-property audio_file: Path#
-

Get the audio file of the task.

-
-
Returns:
-

The audio file of the task.

-
-
-
-
-
-do_task(speech_timestamps: List[Dict[str, int]] | List[List[Dict[str, int]]])[source]#
-

Do the task on the given speech timestamps. The base task will simply save the speech timestamps as the result.

-
-
Parameters:
-

speech_timestamps – The speech timestamps to do the task on as outputted from the VAD.

-
-
-
-
-
-get_result() Tuple[str, list][source]#
-

Get the result of the task. A tuple of the audio file name and the result.

-
-
Returns:
-

The result of the task.

-
-
-
-
-
-to_tuple() Tuple[str, dict][source]#
-

Convert the task to a tuple to reconstruct it later (used for multiprocessing to pass in queue).

-
-
Returns:
-

The converted task.

-
-
-
-
-
-
-class silero_vad.silero_vad.SpeechDiarizationTask(audio_file: Path, speaker_labels: List[str])[source]#
-

Bases: BaseTask

-

A speech diarization task. The task will diarize the VAD speech timestamps into speakers.

-
-
-do_task(speech_timestamps: List[List[Dict[str, int]]])[source]#
-

Do the task on the given speech timestamps. The task will diarize the VAD speech timestamps into speakers.

-
-
Parameters:
-

speech_timestamps – The speech timestamps per channel to do the task on as outputted from the VAD.

-
-
-
-
-
-to_tuple() Tuple[str, dict][source]#
-

Convert the task to a tuple to reconstruct it later (used for multiprocessing to pass in queue).

-
-
Returns:
-

The converted task.

-
-
-
-
-
-
-class silero_vad.silero_vad.TaskCreator(task_type: Type[BaseTask], task_kwargs: dict | None = None)[source]#
-

Bases: object

-

A task creator to create different tasks to run after the VAD.

-
-
-create_task(audio_file: Path) BaseTask[source]#
-

Create a task with the given audio file.

-
-
Parameters:
-

audio_file – The audio file to assign to the task.

-
-
Returns:
-

The created task.

-
-
-
-
-
-classmethod from_tuple(task_tuple: Tuple[str, dict]) BaseTask[source]#
-

Create a task from a tuple of the audio file name and the task kwargs.

-
-
Parameters:
-

task_tuple – The task tuple to create the task from.

-
-
Returns:
-

The created task.

-
-
-
-
-
-
-class silero_vad.silero_vad.VoiceActivityDetector(use_onnx: bool = True, force_onnx_cpu: bool = True, threshold: float = 0.5, sampling_rate: int = 16000, min_speech_duration_ms: int = 250, max_speech_duration_s: float = inf, min_silence_duration_ms: int = 100, window_size_samples: int = 512, speech_pad_ms: int = 30, return_seconds: bool = False, per_channel: bool = False)[source]#
-

Bases: object

-

A voice activity detection wrapper for the silero VAD model - snakers4/silero-vad.

-
-
-detect_voice(audio_file: Path) List[Dict[str, int]] | List[List[Dict[str, int]]][source]#
-

Infer the audio through the VAD model and return the speech timestamps.

-
-
Parameters:
-

audio_file – The audio file to infer.

-
-
Returns:
-

The speech timestamps in the audio. A list of timestamps where each timestamp is a dictionary with the following keys:
  • “start”: The start sample index of the speech in the audio.
  • “end”: The end sample index of the speech in the audio.
If per_channel is True, a list of timestamps per channel will be returned.

-

-
-
-
-
-
-load(force_reload: bool = True)[source]#
-

Load the VAD model.

-
-
Parameters:
-

force_reload – Whether to force reload the model even if it was already loaded. Default is True.

-
-
-
-
-
-
-silero_vad.silero_vad.detect_voice(data_path: str | Path | List[str | Path], use_onnx: bool = True, force_onnx_cpu: bool = True, threshold: float = 0.5, sampling_rate: int = 16000, min_speech_duration_ms: int = 250, max_speech_duration_s: float = inf, min_silence_duration_ms: int = 100, window_size_samples: int = 512, speech_pad_ms: int = 30, return_seconds: bool = False, per_channel: bool = False, use_multiprocessing: int = 0, verbose: bool = False)[source]#
-

Perform voice activity detection on given audio files using the silero VAD model – snakers4/silero-vad. The end result is a dictionary with the file names as keys and their VAD timestamp dictionaries as values.

-

For example:

-
{
    "file_1.wav": [
        {"start": 0, "end": 16000},
        {"start": 16000, "end": 32000},
        {"start": 32000, "end": 48000},
        ...
    ],
    "file_2.wav": [
        {"start": 0, "end": 16000},
        {"start": 16000, "end": 32000},
        {"start": 32000, "end": 48000},
        ...
    ],
    ...
}
-
-
-
-
Parameters:
  • data_path – The path to the audio files to diarize. Can be a path to a single file, a path to a directory, or a list of paths to files.
  • use_onnx – Whether to use ONNX for inference. Default is True.
  • force_onnx_cpu – Whether to force ONNX to use CPU for inference. Default is True.
  • threshold – Speech threshold. Silero VAD outputs speech probabilities for each audio chunk; probabilities ABOVE this value are considered as SPEECH. It is better to tune this parameter for each dataset separately, but a “lazy” 0.5 is pretty good for most datasets.
  • sampling_rate – Currently, silero VAD models support 8000 and 16000 sample rates.
  • min_speech_duration_ms – Final speech chunks shorter than min_speech_duration_ms are thrown out.
  • max_speech_duration_s – Maximum duration of speech chunks in seconds. Chunks longer than max_speech_duration_s will be split at the timestamp of the last silence that lasts more than 100ms (if any), to prevent aggressive cutting. Otherwise, they will be split aggressively just before max_speech_duration_s.
  • min_silence_duration_ms – At the end of each speech chunk, wait for min_silence_duration_ms before separating it.
  • window_size_samples – Audio chunks of window_size_samples size are fed to the silero VAD model. WARNING! Silero VAD models were trained using 512, 1024, 1536 samples for the 16000 sample rate and 256, 512, 768 samples for the 8000 sample rate. Values other than these may affect model performance!
  • speech_pad_ms – Final speech chunks are padded by speech_pad_ms on each side.
  • return_seconds – Whether to return timestamps in seconds. False means to return timestamps in samples (default – False).
  • per_channel – Whether to return timestamps per channel (default – False). This will run VAD on each channel separately and return a list of timestamps per channel.
  • use_multiprocessing – The number of workers to use for multiprocessing. If 0, no multiprocessing will be used. Default is 0.
  • verbose – Verbosity.
-
-
-
-
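A minimal call based on the signature above; the directory path is illustrative, and the returned dictionary has the shape shown in the example output earlier.

from silero_vad.silero_vad import detect_voice

timestamps = detect_voice(
    data_path="./calls",          # single file, directory, or list of files
    sampling_rate=16000,          # silero VAD supports 8000 and 16000
    threshold=0.5,
    min_speech_duration_ms=250,
    return_seconds=True,          # seconds instead of sample indices
)
for file_name, segments in timestamps.items():
    print(file_name, segments)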
-
-silero_vad.silero_vad.diarize(data_path: str | Path | List[str | Path], use_onnx: bool = True, force_onnx_cpu: bool = True, threshold: float = 0.5, sampling_rate: int = 16000, min_speech_duration_ms: int = 250, max_speech_duration_s: float = inf, min_silence_duration_ms: int = 100, window_size_samples: int = 512, speech_pad_ms: int = 30, speaker_labels: List[str] | None = None, use_multiprocessing: int = 0, verbose: bool = False)[source]#
-

Perform speech diarization on given audio files using the silero VAD model – snakers4/silero-vad. The speech diarization is performed per channel, so that each channel in the audio belongs to a different speaker. The end result is a dictionary with the file names as keys and their diarizations as values. A diarization is a list of tuples: (start, end, speaker_label).

-

For example:

-
{
    "file_1.wav": [
        (0.0, 1.0, "speaker_0"),
        (1.0, 2.0, "speaker_1"),
        (2.0, 3.0, "speaker_0"),
        ...
    ],
    "file_2.wav": [
        (0.0, 1.0, "speaker_0"),
        (1.0, 2.0, "speaker_1"),
        (2.0, 3.0, "speaker_0"),
        ...
    ],
    ...
}
-
-
-
-
Parameters:
  • data_path – The path to the audio files to diarize. Can be a path to a single file, a path to a directory, or a list of paths to files.
  • use_onnx – Whether to use ONNX for inference. Default is True.
  • force_onnx_cpu – Whether to force ONNX to use CPU for inference. Default is True.
  • threshold – Speech threshold. Silero VAD outputs speech probabilities for each audio chunk; probabilities ABOVE this value are considered as SPEECH. It is better to tune this parameter for each dataset separately, but a “lazy” 0.5 is pretty good for most datasets.
  • sampling_rate – Currently, silero VAD models support 8000 and 16000 sample rates.
  • min_speech_duration_ms – Final speech chunks shorter than min_speech_duration_ms are thrown out.
  • max_speech_duration_s – Maximum duration of speech chunks in seconds. Chunks longer than max_speech_duration_s will be split at the timestamp of the last silence that lasts more than 100ms (if any), to prevent aggressive cutting. Otherwise, they will be split aggressively just before max_speech_duration_s.
  • min_silence_duration_ms – At the end of each speech chunk, wait for min_silence_duration_ms before separating it.
  • window_size_samples – Audio chunks of window_size_samples size are fed to the silero VAD model. WARNING! Silero VAD models were trained using 512, 1024, 1536 samples for the 16000 sample rate and 256, 512, 768 samples for the 8000 sample rate. Values other than these may affect model performance!
  • speech_pad_ms – Final speech chunks are padded by speech_pad_ms on each side.
  • speaker_labels – The speaker labels to use for the diarization. If not given, the speakers will be named “speaker_0”, “speaker_1”, etc.
  • use_multiprocessing – The number of workers to use for multiprocessing. If 0, no multiprocessing will be used. Default is 0.
  • verbose – Verbosity.
-
-
-
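And a sketch for the channel-per-speaker diarization variant; the directory and speaker labels are illustrative.

from silero_vad.silero_vad import diarize

# Each audio channel is treated as a different speaker: one label per channel.
diarization = diarize(
    data_path="./stereo_calls",
    speaker_labels=["agent", "client"],
    use_multiprocessing=0,
)
# Maps each file name to a list of (start, end, speaker_label) tuples.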
+
+

silero_vad.silero_vad module#

Module contents#

@@ -485,33 +200,7 @@

Submodules
diff --git a/functions/development/silero_vad/latest/static/documentation.html b/functions/development/silero_vad/latest/static/documentation.html
index 344c2421..17866937 100644
--- a/functions/development/silero_vad/latest/static/documentation.html
+++ b/functions/development/silero_vad/latest/static/documentation.html
@@ -165,33 +165,7 @@

Contents

@@ -205,267 +179,8 @@

silero_vad package

Submodules#

-
-

silero_vad.silero_vad module#

-
-
-class silero_vad.silero_vad.BaseTask(audio_file: Path)[source]#
-

Bases: object

-

A base class for a task to complete after VAD.

-
-
-property audio_file: Path#
-

Get the audio file of the task.

-
-
Returns:
-

The audio file of the task.

-
-
-
-
-
-do_task(speech_timestamps: List[Dict[str, int]] | List[List[Dict[str, int]]])[source]#
-

Do the task on the given speech timestamps. The base task will simply save the speech timestamps as the result.

-
-
Parameters:
-

speech_timestamps – The speech timestamps to do the task on as outputted from the VAD.

-
-
-
-
-
-get_result() Tuple[str, list][source]#
-

Get the result of the task. A tuple of the audio file name and the result.

-
-
Returns:
-

The result of the task.

-
-
-
-
-
-to_tuple() Tuple[str, dict][source]#
-

Convert the task to a tuple to reconstruct it later (used for multiprocessing to pass in queue).

-
-
Returns:
-

The converted task.

-
-
-
-
-
-
-class silero_vad.silero_vad.SpeechDiarizationTask(audio_file: Path, speaker_labels: List[str])[source]#
-

Bases: BaseTask

-

A speech diarization task. The task will diarize the VAD speech timestamps into speakers.

-
-
-do_task(speech_timestamps: List[List[Dict[str, int]]])[source]#
-

Do the task on the given speech timestamps. The task will diarize the VAD speech timestamps into speakers.

-
-
Parameters:
-

speech_timestamps – The speech timestamps per channel to do the task on as outputted from the VAD.

-
-
-
-
-
-to_tuple() Tuple[str, dict][source]#
-

Convert the task to a tuple to reconstruct it later (used for multiprocessing to pass in queue).

-
-
Returns:
-

The converted task.

-
-
-
-
-
-
-class silero_vad.silero_vad.TaskCreator(task_type: Type[BaseTask], task_kwargs: dict | None = None)[source]#
-

Bases: object

-

A task creator to create different tasks to run after the VAD.

-
-
-create_task(audio_file: Path) BaseTask[source]#
-

Create a task with the given audio file.

-
-
Parameters:
-

audio_file – The audio file to assign to the task.

-
-
Returns:
-

The created task.

-
-
-
-
-
-classmethod from_tuple(task_tuple: Tuple[str, dict]) BaseTask[source]#
-

Create a task from a tuple of the audio file name and the task kwargs.

-
-
Parameters:
-

task_tuple – The task tuple to create the task from.

-
-
Returns:
-

The created task.

-
-
-
-
-
-
-class silero_vad.silero_vad.VoiceActivityDetector(use_onnx: bool = True, force_onnx_cpu: bool = True, threshold: float = 0.5, sampling_rate: int = 16000, min_speech_duration_ms: int = 250, max_speech_duration_s: float = inf, min_silence_duration_ms: int = 100, window_size_samples: int = 512, speech_pad_ms: int = 30, return_seconds: bool = False, per_channel: bool = False)[source]#
-

Bases: object

-

A voice activity detection wrapper for the silero VAD model - snakers4/silero-vad.

-
-
-detect_voice(audio_file: Path) List[Dict[str, int]] | List[List[Dict[str, int]]][source]#
-

Infer the audio through the VAD model and return the speech timestamps.

-
-
Parameters:
-

audio_file – The audio file to infer.

-
-
Returns:
-

The speech timestamps in the audio. A list of timestamps where each timestamp is a dictionary with the following keys:
  • “start”: The start sample index of the speech in the audio.
  • “end”: The end sample index of the speech in the audio.
If per_channel is True, a list of timestamps per channel will be returned.

-

-
-
-
-
-
-load(force_reload: bool = True)[source]#
-

Load the VAD model.

-
-
Parameters:
-

force_reload – Whether to force reload the model even if it was already loaded. Default is True.

-
-
-
-
-
-
-silero_vad.silero_vad.detect_voice(data_path: str | Path | List[str | Path], use_onnx: bool = True, force_onnx_cpu: bool = True, threshold: float = 0.5, sampling_rate: int = 16000, min_speech_duration_ms: int = 250, max_speech_duration_s: float = inf, min_silence_duration_ms: int = 100, window_size_samples: int = 512, speech_pad_ms: int = 30, return_seconds: bool = False, per_channel: bool = False, use_multiprocessing: int = 0, verbose: bool = False)[source]#
-

Perform voice activity detection on given audio files using the silero VAD model – snakers4/silero-vad. The end result is a dictionary with the file names as keys and their VAD timestamp dictionaries as values.

-

For example:

-
{
    "file_1.wav": [
        {"start": 0, "end": 16000},
        {"start": 16000, "end": 32000},
        {"start": 32000, "end": 48000},
        ...
    ],
    "file_2.wav": [
        {"start": 0, "end": 16000},
        {"start": 16000, "end": 32000},
        {"start": 32000, "end": 48000},
        ...
    ],
    ...
}
-
-
-
-
Parameters:
  • data_path – The path to the audio files to diarize. Can be a path to a single file, a path to a directory, or a list of paths to files.
  • use_onnx – Whether to use ONNX for inference. Default is True.
  • force_onnx_cpu – Whether to force ONNX to use CPU for inference. Default is True.
  • threshold – Speech threshold. Silero VAD outputs speech probabilities for each audio chunk; probabilities ABOVE this value are considered as SPEECH. It is better to tune this parameter for each dataset separately, but a “lazy” 0.5 is pretty good for most datasets.
  • sampling_rate – Currently, silero VAD models support 8000 and 16000 sample rates.
  • min_speech_duration_ms – Final speech chunks shorter than min_speech_duration_ms are thrown out.
  • max_speech_duration_s – Maximum duration of speech chunks in seconds. Chunks longer than max_speech_duration_s will be split at the timestamp of the last silence that lasts more than 100ms (if any), to prevent aggressive cutting. Otherwise, they will be split aggressively just before max_speech_duration_s.
  • min_silence_duration_ms – At the end of each speech chunk, wait for min_silence_duration_ms before separating it.
  • window_size_samples – Audio chunks of window_size_samples size are fed to the silero VAD model. WARNING! Silero VAD models were trained using 512, 1024, 1536 samples for the 16000 sample rate and 256, 512, 768 samples for the 8000 sample rate. Values other than these may affect model performance!
  • speech_pad_ms – Final speech chunks are padded by speech_pad_ms on each side.
  • return_seconds – Whether to return timestamps in seconds. False means to return timestamps in samples (default – False).
  • per_channel – Whether to return timestamps per channel (default – False). This will run VAD on each channel separately and return a list of timestamps per channel.
  • use_multiprocessing – The number of workers to use for multiprocessing. If 0, no multiprocessing will be used. Default is 0.
  • verbose – Verbosity.
-
-
-
-
-
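A minimal usage sketch of this function (the directory path is a placeholder):

from silero_vad.silero_vad import detect_voice

# Run VAD over a directory of recordings; return_seconds=True yields
# timestamps in seconds rather than sample indices.
speech = detect_voice(
    data_path="recordings/",  # placeholder path
    return_seconds=True,
    verbose=True,
)

for file_name, segments in speech.items():
    total = sum(segment["end"] - segment["start"] for segment in segments)
    print(f"{file_name}: {len(segments)} segments, {total:.1f}s of speech")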
silero_vad.silero_vad.diarize(data_path: str | Path | List[str | Path], use_onnx: bool = True, force_onnx_cpu: bool = True, threshold: float = 0.5, sampling_rate: int = 16000, min_speech_duration_ms: int = 250, max_speech_duration_s: float = inf, min_silence_duration_ms: int = 100, window_size_samples: int = 512, speech_pad_ms: int = 30, speaker_labels: List[str] | None = None, use_multiprocessing: int = 0, verbose: bool = False)

Perform speech diarization on the given audio files using the silero VAD model (snakers4/silero-vad). The speech diarization is performed per channel, so that each channel in the audio belongs to a different speaker. The end result is a dictionary with the file names as keys and their diarizations as values. A diarization is a list of tuples: (start, end, speaker_label).

For example:

{
    "file_1.wav": [
        (0.0, 1.0, "speaker_0"),
        (1.0, 2.0, "speaker_1"),
        (2.0, 3.0, "speaker_0"),
        ...
    ],
    "file_2.wav": [
        (0.0, 1.0, "speaker_0"),
        (1.0, 2.0, "speaker_1"),
        (2.0, 3.0, "speaker_0"),
        ...
    ],
    ...
}

Parameters:

  • data_path – The path to the audio files to diarize. Can be a path to a single file, a path to a directory, or a list of file paths.
  • use_onnx – Whether to use ONNX for inference. Default is True.
  • force_onnx_cpu – Whether to force ONNX to use CPU for inference. Default is True.
  • threshold – Speech threshold. Silero VAD outputs speech probabilities for each audio chunk; probabilities above this value are considered speech. It is better to tune this parameter for each dataset separately, but a "lazy" 0.5 is pretty good for most datasets.
  • sampling_rate – Currently, silero VAD models support 8000 and 16000 sample rates.
  • min_speech_duration_ms – Final speech chunks shorter than min_speech_duration_ms are thrown out.
  • max_speech_duration_s – Maximum duration of speech chunks in seconds. Chunks longer than max_speech_duration_s will be split at the timestamp of the last silence that lasts more than 100 ms (if any), to prevent aggressive cutting. Otherwise, they will be split aggressively just before max_speech_duration_s.
  • min_silence_duration_ms – At the end of each speech chunk, wait for min_silence_duration_ms before separating it.
  • window_size_samples – Audio chunks of window_size_samples size are fed to the silero VAD model. WARNING! Silero VAD models were trained using 512, 1024, and 1536 samples for a 16000 sample rate and 256, 512, and 768 samples for an 8000 sample rate. Values other than these may affect model performance!
  • speech_pad_ms – Final speech chunks are padded by speech_pad_ms on each side.
  • speaker_labels – The speaker labels to use for the diarization. If not given, the speakers will be named "speaker_0", "speaker_1", etc.
  • use_multiprocessing – The number of workers to use for multiprocessing. If 0, no multiprocessing will be used. Default is 0.
  • verbose – Verbosity.
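A usage sketch for stereo call recordings, where each channel holds one speaker (file names and labels are placeholders):

from silero_vad.silero_vad import diarize

# Each channel is treated as a separate speaker; labels map to channel order.
diarization = diarize(
    data_path=["call_1.wav", "call_2.wav"],  # placeholder files
    speaker_labels=["Agent", "Client"],
)

for file_name, segments in diarization.items():
    for start, end, speaker in segments:
        print(f"{file_name}: {speaker} speaks from {start} to {end}")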

silero_vad.silero_vad module#

Module contents#

@@ -485,33 +200,7 @@

Submodules

diff --git a/functions/development/tags.json b/functions/development/tags.json
index e3b61907..fd886ae7 100644
@@ -1 +1 @@
-{"categories": ["data-analysis", "data-generation", "model-training", "genai", "NLP", "model-testing", "utils", "monitoring", "deep-learning", "model-serving", "machine-learning", "audio", "data-preparation"], "kind": ["nuclio:serving", "job", "serving"]}
+{"kind": ["nuclio:serving", "job", "serving"], "categories": ["model-serving", "data-analysis", "audio", "deep-learning", "model-training", "utils", "genai", "model-testing", "machine-learning", "data-generation", "monitoring", "data-preparation", "NLP"]}

diff --git a/functions/development/transcribe/1.2.0/static/documentation.html b/functions/development/transcribe/1.2.0/static/documentation.html
index d92df103..edbea1b6 100644
@@ -165,48 +165,7 @@

@@ -220,323 +179,8 @@

transcribe package

Submodules#

transcribe.transcribe module#
class transcribe.transcribe.BaseTask(audio_file: Path, transcription_output: dict | str, text_file: Path)

Bases: object

A task to write the transcription to file.

do_task()

Try to perform the task, storing an error if one occurred.

get_result() → Tuple[str, str]

Get the result of the task. If the task failed, the error will be returned; otherwise, the result will be the text file name.

Returns:

The task's result.

is_failed() → bool

Check if the task failed.

Returns:

Whether the task failed.

to_tuple() → Tuple[str, dict]

Convert the task to a tuple to reconstruct it later (used for multiprocessing, to pass it in a queue).

Returns:

The converted task.
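As a usage sketch, results can be routed on the documented is_failed() / get_result() pair; treating get_result()'s tuple as (audio file, error or text file name) is an assumption based on the description above:

def split_results(tasks):
    # Partition finished BaseTask objects: get_result() yields the error when
    # is_failed() is True, otherwise the written text file name (assumed layout).
    succeeded, failed = [], []
    for task in tasks:
        (failed if task.is_failed() else succeeded).append(task.get_result())
    return succeeded, failed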
class transcribe.transcribe.BatchProcessor(audio_files: List[Path], output_directory: Path)

Bases: object

A batch processor to process batches of transcriptions. The batch processor creates tasks and is designed to work alongside the transcriber. It can be used with a multiprocessing queue, or the tasks can be run directly using the associated methods.

do_tasks()

Perform the tasks. Should be used if no multiprocessing queue is given to a transcriber.

get_results() → List[Tuple[bool, Tuple[str, str]]]

Get the results of the tasks. The stored results are then cleared.

Returns:

The results of the tasks.

get_tasks() → List[BaseTask]

Get the tasks to perform.

Returns:

The tasks to perform.

process_batch(batch: List[dict | str])

Process a batch of transcriptions. Tasks related to the given batch will be created and stored in the batch processor.

Parameters:

batch – The batch of transcriptions to process.
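A sketch of the direct flow (no multiprocessing queue); the batch contents are placeholders, and reading the boolean in each result tuple as a failure flag is an assumption:

from pathlib import Path

from transcribe.transcribe import BatchProcessor

# Feed one batch of transcription outputs, run the created tasks directly,
# then drain the stored results.
processor = BatchProcessor(
    audio_files=[Path("a.wav"), Path("b.wav")],  # placeholder files
    output_directory=Path("transcriptions"),
)
processor.process_batch(batch=["text of a", "text of b"])  # placeholder outputs
processor.do_tasks()

for failed, result in processor.get_results():
    print("error" if failed else "written", result)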
class transcribe.transcribe.PerChannelSpeechDiarizationBatchProcessor(audio_files: List[Path], output_directory: Path, n_channels: int, speakers: List[str])

Bases: BatchProcessor

A batch processor to process batches of transcriptions per channel. The batch processor creates tasks with the selected number of channels given and is designed to work alongside the transcriber. It can be used with a multiprocessing queue, or the tasks can be run directly using the associated methods.

process_batch(batch: List[dict])

Process a batch of transcriptions. Tasks related to the given batch will be created and stored in the batch processor.

Parameters:

batch – The batch of transcriptions to process.
class transcribe.transcribe.SpeechDiarizationBatchProcessor(audio_files: List[Path], output_directory: Path, speech_diarization: dict)

Bases: BatchProcessor

A batch processor to process batches of transcriptions with respect to a given speech diarization. The batch processor creates tasks and is designed to work alongside the transcriber. It can be used with a multiprocessing queue, or the tasks can be run directly using the associated methods.

process_batch(batch: List[dict])

Process a batch of transcriptions. Tasks related to the given batch will be created and stored in the batch processor.

Parameters:

batch – The batch of transcriptions to process.
class transcribe.transcribe.SpeechDiarizationPerChannelTask(audio_file: Path, text_file: Path)

Bases: BaseTask

A task to write the transcription to file with respect to a given speech diarization per channel.

do_task()

Try to perform the task, storing an error if one occurred.

to_tuple() → Tuple[str, dict]

Convert the task to a tuple to reconstruct it later (used for multiprocessing, to pass it in a queue).

Returns:

The converted task.

property transcription_output_channels: List[Tuple[str, dict]]

Get the transcription output channels.

Returns:

The transcription output channels.
class transcribe.transcribe.SpeechDiarizationTask(audio_file: Path, transcription_output: dict, text_file: Path, speech_diarization: List[Tuple[float, float, str]])

Bases: BaseTask

A task to write the transcription to file with respect to a given speech diarization.

to_tuple() → Tuple[str, dict]

Convert the task to a tuple to reconstruct it later (used for multiprocessing, to pass it in a queue).

Returns:

The converted task.
class transcribe.transcribe.Transcriber(model_name: str, device: str | None = None, use_flash_attention_2: bool | None = None, use_better_transformers: bool | None = None, assistant_model: str | None = None, max_new_tokens: int = 128, chunk_length_s: int = 30, batch_size: int = 2, spoken_language: str | None = None, translate_to_english: bool = False, return_timestamps: bool | Literal['word'] = False, per_channel_transcription: int = 0)

Bases: object

A transcription wrapper for Huggingface's ASR pipeline (https://huggingface.co/transformers/main_classes/pipelines.html#transformers.AutomaticSpeechRecognitionPipeline), for use with OpenAI's Whisper models (https://huggingface.co/openai).

load()

Load the transcriber. Must be called before transcribing.

transcribe(audio_files: List[Path], batch_processor: BatchProcessor | None = None, batches_queue: Queue | None = None, verbose: bool = False) → List[List[dict]] | None

Transcribe the given audio files. The transcriptions will be sent to a queue or a batch processor for further processing, such as writing to text files. If no queue or batch processor is given, the transcription outputs from the pipeline are returned; otherwise, None is returned.

Parameters:

  • audio_files – The audio files to transcribe.
  • batch_processor – A batch processor.
  • batches_queue – A multiprocessing queue to put the batches in.
  • verbose – Whether to show a progress bar. Default is False.

Returns:

The transcription outputs from the pipeline if no queue or batch processor is given; otherwise, None.
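A minimal sketch of driving the Transcriber directly, without a queue or batch processor (the file name is a placeholder):

from pathlib import Path

from transcribe.transcribe import Transcriber

# load() must be called before transcribe(); with no queue or batch
# processor, the raw pipeline outputs are returned as lists of batches.
transcriber = Transcriber(model_name="openai/whisper-tiny", batch_size=2)
transcriber.load()

outputs = transcriber.transcribe(audio_files=[Path("meeting.wav")], verbose=True)
for batch in outputs:
    for output in batch:
        print(output)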
transcribe.transcribe.open_mpi_handler(worker_inputs: List[str], root_worker_inputs: Dict[str, Any] | None = None)

transcribe.transcribe.transcribe(data_path: str | Path | List[str | Path], output_directory: str | None = None, model_name: str = 'openai/whisper-tiny', device: str | None = None, use_flash_attention_2: bool | None = None, use_better_transformers: bool | None = None, assistant_model: str | None = None, max_new_tokens: int = 128, chunk_length_s: int = 30, batch_size: int = 8, spoken_language: str | None = None, translate_to_english: bool = False, speech_diarization: Dict[str, List[Tuple[float, float, str]]] | None = None, speech_diarize_per_channel: int | None = None, speaker_labels: List[str] | None = None, use_multiprocessing: bool | int = False, verbose: bool = False)

Transcribe audio files into text files and collect additional data. The end result is a directory of transcribed text files and a dataframe containing the following columns:

  • audio_file – The audio file path.
  • transcription_file – The transcribed text file name in the output directory.

The transcription is based on Huggingface's ASR pipeline (https://huggingface.co/transformers/main_classes/pipelines.html#transformers.AutomaticSpeechRecognitionPipeline) and is tested with OpenAI's Whisper models (https://huggingface.co/openai).

If one of the speech diarization parameters is given (either speech_diarization or speech_diarize_per_channel), the transcription will be written in a conversation format, where each speaker is written on a separate line:

speaker_1: text
speaker_2: text
speaker_1: text
...

Parameters:

  • data_path – A directory of audio files, a single file, or a list of files to transcribe.
  • output_directory – Path to a directory to save all transcribed audio files. If not given, the transcribed files are saved in a temporary directory.
  • model_name – The model name to use. Should be one of OpenAI's Whisper models for best results (for example "tiny", "base", "large", etc.). See https://huggingface.co/openai?search_models=whisper for more information.
  • device – The device to use for inference. If not given, GPU is used if available.
  • use_flash_attention_2 – Whether to use the Flash Attention 2 implementation. It can be used only with one of the following GPUs: Nvidia H series and Nvidia A series. T4 support will be available soon. Note: If both use_flash_attention_2 and use_better_transformers are None, the optimization is chosen automatically according to the available resources.
  • use_better_transformers – Whether to use the Better Transformers library to further optimize the model. Should be used for all use cases that do not support Flash Attention 2. Note: If both use_flash_attention_2 and use_better_transformers are None, the optimization is chosen automatically according to the available resources.
  • assistant_model – The assistant model name to use for inference. Notice that the optimizations (Flash Attention 2 and Better Transformers) are applied to the assistant as well. Should be a model from Huggingface's distil-whisper (see huggingface/distil-whisper for more information). Note: Currently an assistant model is only usable with a batch size of 1.
  • max_new_tokens – The maximum number of new tokens to generate, used to limit the generation length. Default is 128 tokens.
  • chunk_length_s – The audio chunk length to split the audio into (in seconds). Default is 30 seconds.
  • batch_size – The batch size to use for inference. Default is 8.
  • spoken_language – Tell Whisper which language is spoken. If None, it will try to detect the language.
  • translate_to_english – Whether to translate the transcriptions to English.
  • speech_diarization – A speech diarization dictionary with the file names to transcribe as keys and their diarizations as values. A diarization is a list of tuples: (start, end, speaker). An example of a diarization dictionary:

    {
        "audio_file_name": [
            (0.0, 2.0, "Agent"),
            (2.0, 4.0, "Client"),
        ]
    }

    Note: The diarization must cover the entire duration of the audio file (as long as Whisper is predicting words up until then).
  • speech_diarize_per_channel – Perform speech diarization per channel. Each speaker is expected to belong to a separate channel in the audio. Notice: this will make the transcription slower, as each channel will be transcribed separately. If a speech diarization is passed (via the speech_diarization parameter), this parameter is ignored.
  • speaker_labels – A list of speaker labels, in channel order, to use when writing the transcription with per-channel speech diarization. This won't be used together with a given speech diarization (via the speech_diarization parameter).
  • use_multiprocessing – Whether to use multiprocessing to transcribe the audio files. Can be either a boolean value or an integer. If True, the default number of workers (3) is used: 1 for transcription, 1 for batch processing, and 1 for task completion (such as speech diarization and writing to files). To control the number of task-completion workers, an integer can be provided. If False, a single process is used. Default is False.
  • verbose – Whether to print the progress of the transcription. Default is False.
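A usage sketch combining the transcribe function with a diarization dictionary of the documented shape (paths, labels, and times are placeholders):

from transcribe.transcribe import transcribe

# Transcribe one call recording and render it in conversation format using a
# pre-computed speech diarization (times in seconds, as documented above).
transcribe(
    data_path="call.wav",               # placeholder file
    output_directory="transcriptions",  # placeholder directory
    model_name="openai/whisper-tiny",
    speech_diarization={
        "call.wav": [
            (0.0, 2.0, "Agent"),
            (2.0, 4.0, "Client"),
        ],
    },
    verbose=True,
)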
transcribe.transcribe module#

Module contents#

@@ -556,48 +200,7 @@

Submodules

diff --git a/functions/development/transcribe/latest/static/documentation.html b/functions/development/transcribe/latest/static/documentation.html
index d92df103..edbea1b6 100644
@@ -165,48 +165,7 @@

The latest transcribe documentation page carries the same removals as the 1.2.0 page above (identical content hashes, d92df103..edbea1b6).

Submodules

diff --git a/functions/development/translate/0.3.0/static/documentation.html b/functions/development/translate/0.3.0/static/documentation.html
index 9e4fdd01..ed49c80c 100644
@@ -165,11 +165,7 @@

@@ -183,52 +179,8 @@

translate package

Submodules#

translate.translate module#
translate.translate.open_mpi_handler(worker_inputs: List[str], root_worker_inputs: Dict[str, Any] | None = None)

translate.translate.translate(data_path: str | List[str] | Path, output_directory: str, model_name: str | None = None, source_language: str | None = None, target_language: str | None = None, device: str | None = None, model_kwargs: dict | None = None, batch_size: int = 1, translation_kwargs: dict | None = None, verbose: bool = False) → Tuple[str, DataFrame, dict]

Translate text files using a transformer model from Huggingface's hub, according to the given source and target languages (or using the directly provided model name). The end result is a directory of translated text files and a dataframe containing the following columns:

  • text_file – The text file path.
  • translation_file – The translated text file name in the output directory.

Parameters:

  • data_path – A directory of text files, a single file, or a list of files to translate.
  • output_directory – Directory where the translated files will be saved.
  • model_name – The name of a model to load. If None, the model name is constructed from the source and target language parameters.
  • source_language – The source language code (e.g., 'en' for English).
  • target_language – The target language code (e.g., 'en' for English).
  • model_kwargs – Keyword arguments to pass when loading the model in HuggingFace's pipeline function.
  • device – The device index for transformers. By default, CUDA is preferred if available.
  • batch_size – The batch size to use in translation. The files are translated one by one, but the sentences within each file can be batched.
  • translation_kwargs – Additional keyword arguments to pass to the transformers.TranslationPipeline when doing the translation inference. Notice that the batch size is added automatically.
  • verbose – Whether to present logs of a progress bar and errors. Default: False.

Returns:

A tuple of:

  • Path to the output directory.
  • A dataframe dataset of the translated file names.
  • A dictionary of errored files that were not translated.
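A usage sketch translating a folder of text files from French to English (directory names are placeholders; with no model_name given, the name is built from the language codes):

from translate.translate import translate

# Translate every text file in a directory; the returned dataframe maps each
# text_file to its translation_file, and errors holds files that failed.
output_dir, translations_df, errors = translate(
    data_path="texts/",             # placeholder directory
    output_directory="translated",  # placeholder directory
    source_language="fr",
    target_language="en",
    batch_size=4,
    verbose=True,
)
print(translations_df.head())
print(f"{len(errors)} files failed")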
translate.translate module#

Module contents#

@@ -248,11 +200,7 @@

Submodules

diff --git a/functions/development/translate/latest/static/documentation.html b/functions/development/translate/latest/static/documentation.html
index 9e4fdd01..ed49c80c 100644
@@ -165,11 +165,7 @@

The latest translate documentation page carries the same removals as the 0.3.0 page above (identical content hashes, 9e4fdd01..ed49c80c).


Submodules

diff --git a/modules/development/openai_proxy_app/1.0.0/src/openai_proxy_app.py b/modules/development/openai_proxy_app/1.0.0/src/openai_proxy_app.py
index a0e9df7a..f97ac1d6 100644
--- a/modules/development/openai_proxy_app/1.0.0/src/openai_proxy_app.py
+++ b/modules/development/openai_proxy_app/1.0.0/src/openai_proxy_app.py
@@ -29,7 +29,7 @@
 print("Wrote /opt/app/openai_proxy_app.py")
 PY
-exec gunicorn openai:app \
+exec gunicorn openai_proxy_app:app \
     --chdir /opt/app \
     --bind 0.0.0.0:8000 \
     --worker-class uvicorn.workers.UvicornWorker \

The fix points gunicorn at the app's own module (openai_proxy_app) instead of the installed openai SDK package, which does not expose an app object. The identical one-line change is applied to the rendered copies of this script in modules/development/openai_proxy_app/1.0.0/static/openai_proxy_app.html and modules/development/openai_proxy_app/1.0.0/static/source.html (the "Source code for openai_proxy_app.openai_proxy_app" pages).

The latest copies of the module receive the identical gunicorn fix: modules/development/openai_proxy_app/latest/src/openai_proxy_app.py (index a0e9df7a..f97ac1d6) and its rendered static/openai_proxy_app.html and static/source.html pages (index 3acfb690..56803091 and 939698ab..3388b964), matching the 1.0.0 copies above.

diff --git a/modules/development/tags.json b/modules/development/tags.json
index 1a982877..4ee810bd 100644
@@ -1 +1 @@
-{"categories": ["model-serving", "genai", "structured-ML"], "kind": ["generic", "monitoring_application"]}
+{"categories": ["genai", "model-serving", "structured-ML"], "kind": ["generic", "monitoring_application"]}