diff --git a/.github/workflows/tests-dev.yml b/.github/workflows/tests-dev.yml
index c86da61..e2ed8cc 100644
--- a/.github/workflows/tests-dev.yml
+++ b/.github/workflows/tests-dev.yml
@@ -38,7 +38,7 @@ jobs:
 
       - name: Run organize and build images
         run: |
-          domino piece organize --build-images --source-url=https://github.com/${{github.repository}} --tag-overwrite=dev
+          domino piece organize --build-images --source-url=https://github.com/${{github.repository}} --tag-overwrite=development
 
       - name: Install Tests Dependencies
         run: pip install -r requirements-tests.txt
diff --git a/dependencies/Dockerfile_whisper b/dependencies/Dockerfile_whisper
new file mode 100644
index 0000000..a7c87fb
--- /dev/null
+++ b/dependencies/Dockerfile_whisper
@@ -0,0 +1,20 @@
+FROM ghcr.io/tauffer-consulting/domino-base-piece:gpu
+
+
+# Install specific requirements
+ENV TZ=UTC
+ARG DEBIAN_FRONTEND=noninteractive
+RUN apt update
+RUN apt install ffmpeg -y
+RUN apt install git -y
+
+# Need to copy operators source code
+COPY config.toml domino/pieces_repository/
+COPY pieces domino/pieces_repository/pieces
+COPY .domino domino/pieces_repository/.domino
+
+# Install specific python dependencies
+RUN pip install -U openai-whisper==20231106
+
+# Download Whisper model
+RUN python3 -c "import whisper; whisper.load_model('tiny')"
\ No newline at end of file
diff --git a/pieces/AudioTranscriptionLocalPiece/metadata.json b/pieces/AudioTranscriptionLocalPiece/metadata.json
new file mode 100644
index 0000000..091c5ea
--- /dev/null
+++ b/pieces/AudioTranscriptionLocalPiece/metadata.json
@@ -0,0 +1,22 @@
+{
+    "name": "AudioTranscriptionLocalPiece",
+    "description": "Runs transcription locally using Whisper, a general-purpose speech recognition model. Ref: https://github.com/openai/whisper",
+    "dependency": {
+        "dockerfile": "Dockerfile_whisper"
+    },
+    "container_resources": {
+        "use_gpu": true,
+        "requests": {
+            "cpu": "1000m",
+            "memory": "3Gi"
+        },
+        "limits": {
+            "cpu": "5000m",
+            "memory": "15Gi"
+        }
+    },
+    "style": {
+        "node_label": "Audio Transcription Local",
+        "icon_class_name": "fa-solid:comment-dots"
+    }
+}
\ No newline at end of file
diff --git a/pieces/AudioTranscriptionLocalPiece/models.py b/pieces/AudioTranscriptionLocalPiece/models.py
new file mode 100644
index 0000000..824fdfd
--- /dev/null
+++ b/pieces/AudioTranscriptionLocalPiece/models.py
@@ -0,0 +1,45 @@
+from pydantic import BaseModel, Field, FilePath
+from typing import Union
+from enum import Enum
+
+
+class ModelSizeType(str, Enum):
+    tiny = "tiny"
+    base = "base"
+    small = "small"
+    medium = "medium"
+    large = "large"
+
+
+class OutputTypeType(str, Enum):
+    string = "string"
+    file = "file"
+    both = "both"
+
+
+class InputModel(BaseModel):
+    audio_file_path: str = Field(
+        description='The path to the audio file to process.',
+        json_schema_extra={
+            "from_upstream": "always"
+        }
+    )
+    output_type: OutputTypeType = Field(
+        default=OutputTypeType.string,
+        description='The type of output for the result text. Options are `string`, `file` or `both`. Default is `string`.',
+    )
+    model_size: ModelSizeType = Field(
+        description='The size of the model to use. Default is `tiny`.',
+        default=ModelSizeType.tiny
+    )
+
+
+class OutputModel(BaseModel):
+    transcription_result: str = Field(
+        default="",
+        description="The result transcription text as a string."
+    )
+    file_path_transcription_result: Union[FilePath, str] = Field(
+        default="",
+        description="The path to the text file with the transcription result."
+    )
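A quick note on the models above: because the enums subclass `str`, pydantic coerces plain string values into enum members on validation, and omitted fields fall back to their declared defaults. A minimal sketch of that behavior, assuming the repository is importable as a package (the import path and the audio path are hypothetical placeholders):

```python
from pieces.AudioTranscriptionLocalPiece.models import (
    InputModel, ModelSizeType, OutputTypeType,
)

# String values are coerced into the str-backed enums on validation.
inp = InputModel(audio_file_path="/tmp/example.mp3", output_type="both")
assert inp.output_type is OutputTypeType.both

# Fields left out of the payload fall back to their declared defaults.
assert inp.model_size is ModelSizeType.tiny
```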
diff --git a/pieces/AudioTranscriptionLocalPiece/piece.py b/pieces/AudioTranscriptionLocalPiece/piece.py
new file mode 100644
index 0000000..43bbcbf
--- /dev/null
+++ b/pieces/AudioTranscriptionLocalPiece/piece.py
@@ -0,0 +1,54 @@
+from domino.base_piece import BasePiece
+from .models import InputModel, OutputModel
+import whisper
+
+
+class AudioTranscriptionLocalPiece(BasePiece):
+
+    def piece_function(self, input_data: InputModel):
+
+        self.logger.info("Loading model...")
+        model = whisper.load_model(input_data.model_size)
+
+        self.logger.info("Transcribing audio file...")
+        result = model.transcribe(str(input_data.audio_file_path))["text"]
+
+        if input_data.output_type == "string":
+            self.logger.info("Transcription completed successfully. Result returned as string.")
+            transcription_result = result
+            output_file_path = ""
+        elif input_data.output_type == "file":
+            self.logger.info("Transcription completed successfully. Result returned as file.")
+            transcription_result = ""
+            output_file_path = "transcription_result.txt"
+            with open(output_file_path, "w") as f:
+                f.write(result)
+        else:
+            self.logger.info("Transcription completed successfully. Result returned as string and file.")
+            transcription_result = result
+            output_file_path = "transcription_result.txt"
+            with open(output_file_path, "w") as f:
+                f.write(result)
+
+        # Display result in the Domino GUI
+        self.format_display_result(input_data=input_data, string_transcription_result=result)
+
+        return OutputModel(
+            transcription_result=transcription_result,
+            file_path_transcription_result=output_file_path
+        )
+
+    def format_display_result(self, input_data: InputModel, string_transcription_result: str):
+        md_text = f"""## Audio Transcription Result \n
+{string_transcription_result} \n
+
+## Args
+**model_size**: {input_data.model_size.value}\n
+"""
+        file_path = f"{self.results_path}/display_result.md"
+        with open(file_path, "w") as f:
+            f.write(md_text)
+        self.display_result = {
+            "file_type": "md",
+            "file_path": file_path
+        }
diff --git a/pieces/AudioTranscriptionLocalPiece/test-audio-to-transcribe.mp3 b/pieces/AudioTranscriptionLocalPiece/test-audio-to-transcribe.mp3
new file mode 100644
index 0000000..b88ed80
Binary files /dev/null and b/pieces/AudioTranscriptionLocalPiece/test-audio-to-transcribe.mp3 differ
diff --git a/pieces/AudioTranscriptionLocalPiece/test_localtranscription_piece.py b/pieces/AudioTranscriptionLocalPiece/test_localtranscription_piece.py
new file mode 100644
index 0000000..8981cb0
--- /dev/null
+++ b/pieces/AudioTranscriptionLocalPiece/test_localtranscription_piece.py
@@ -0,0 +1,20 @@
+from domino.testing import piece_dry_run
+from pathlib import Path
+
+
+test_file = str(Path(__file__).parent / "test-audio-to-transcribe.mp3")
+
+
+def test_whisper_piece():
+    input_data = {
+        "audio_file_path": test_file,
+        "model_size": "tiny",
+        "output_type": "both"
+    }
+    piece_output = piece_dry_run(
+        piece_name="AudioTranscriptionLocalPiece",
+        input_data=input_data,
+    )
+    assert piece_output["transcription_result"]
+    assert piece_output["file_path_transcription_result"]
+    assert "audio" in piece_output.get("transcription_result", "").lower()
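For context on what `piece_function` wraps, here is a minimal standalone Whisper sketch, assuming `openai-whisper` and `ffmpeg` are installed; `audio.mp3` is a hypothetical input file:

```python
import whisper

# load_model fetches the weights on first use; Dockerfile_whisper
# pre-downloads the "tiny" model so this becomes a cache hit at runtime.
model = whisper.load_model("tiny")

# transcribe returns a dict; the full transcript lives under the "text" key.
result = model.transcribe("audio.mp3")  # hypothetical input file
print(result["text"])
```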
"fa-solid:comment-dots" } } \ No newline at end of file diff --git a/pieces/AudioTranscriptionPiece/models.py b/pieces/AudioTranscriptionPiece/models.py index f17d556..7894bcb 100644 --- a/pieces/AudioTranscriptionPiece/models.py +++ b/pieces/AudioTranscriptionPiece/models.py @@ -2,60 +2,44 @@ from enum import Enum - class OutputTypeType(str, Enum): - """ - Output type for the result text - """ file = "file" string = "string" - file_and_string = "file_and_string" + both = "both" class InputModel(BaseModel): - """ - AudioTranscriptPiece input model - """ audio_file_path: str = Field( - ..., description='The path to the audio file to process.', - + json_schema_extra={ + "from_upstream": "always" + } ) output_type: OutputTypeType = Field( default=OutputTypeType.string, - description='The type of output for the result text', - + description='The type of output for the result text. Options are `string`, `file` or `both`. Default is `string`.', ) temperature: float = Field( description="What sampling temperature to use, between 0 and 1", default=0.1, gt=0., le=1, - + ) class OutputModel(BaseModel): - """ - AudioTranscriptPiece output model - """ - message: str = Field( - description="Output message to log" - ) - string_transcription_result: str = Field( - default=None, + transcription_result: str = Field( + default="", description="The result transcription text as a string." ) file_path_transcription_result: str = Field( - default=None, - description="The result transcription text as a file path." + default="", + description="The path to the text file with the transcription result." ) class SecretsModel(BaseModel): - """ - AudioTranscriptPiece secret model - """ OPENAI_API_KEY: str = Field( description="OpenAI API key" - ) \ No newline at end of file + ) diff --git a/pieces/AudioTranscriptionPiece/piece.py b/pieces/AudioTranscriptionPiece/piece.py index 2220560..84232e8 100644 --- a/pieces/AudioTranscriptionPiece/piece.py +++ b/pieces/AudioTranscriptionPiece/piece.py @@ -16,7 +16,7 @@ def piece_function(self, input_data: InputModel, secrets_data: SecretsModel): # Input arguments are retrieved from the Input model object file_path = input_data.audio_file_path - + print("Making OpenAI audio transcription request...") try: full_audio = AudioSegment.from_mp3(file_path) @@ -31,9 +31,9 @@ def piece_function(self, input_data: InputModel, secrets_data: SecretsModel): endpoint = min((i+1)*ten_minutes, total_time-1) minutes = full_audio[i*ten_minutes:endpoint] minutes.export(f"audio_piece_{i}.mp3", format="mp3") - audio_file= open(f"audio_piece_{i}.mp3", "rb") + audio_file = open(f"audio_piece_{i}.mp3", "rb") transcript = client.audio.transcriptions.create( - model="whisper-1", + model="whisper-1", file=audio_file, temperature=input_data.temperature ) @@ -48,35 +48,31 @@ def piece_function(self, input_data: InputModel, secrets_data: SecretsModel): # Display result in the Domino GUI self.format_display_result(input_data=input_data, string_transcription_result=full_transcript) - + if input_data.output_type == "string": self.logger.info("Transcription complete successfully. Result returned as string.") - msg = f"Transcription complete successfully. Result returned as string." 
diff --git a/pieces/AudioTranscriptionPiece/piece.py b/pieces/AudioTranscriptionPiece/piece.py
index 2220560..84232e8 100644
--- a/pieces/AudioTranscriptionPiece/piece.py
+++ b/pieces/AudioTranscriptionPiece/piece.py
@@ -16,7 +16,7 @@ def piece_function(self, input_data: InputModel, secrets_data: SecretsModel):
 
         # Input arguments are retrieved from the Input model object
         file_path = input_data.audio_file_path
-
+
         print("Making OpenAI audio transcription request...")
         try:
             full_audio = AudioSegment.from_mp3(file_path)
@@ -31,9 +31,9 @@ def piece_function(self, input_data: InputModel, secrets_data: SecretsModel):
             endpoint = min((i+1)*ten_minutes, total_time-1)
             minutes = full_audio[i*ten_minutes:endpoint]
             minutes.export(f"audio_piece_{i}.mp3", format="mp3")
-            audio_file= open(f"audio_piece_{i}.mp3", "rb")
+            audio_file = open(f"audio_piece_{i}.mp3", "rb")
             transcript = client.audio.transcriptions.create(
-                model="whisper-1", 
+                model="whisper-1",
                 file=audio_file,
                 temperature=input_data.temperature
             )
@@ -48,35 +48,31 @@ def piece_function(self, input_data: InputModel, secrets_data: SecretsModel):
 
         # Display result in the Domino GUI
         self.format_display_result(input_data=input_data, string_transcription_result=full_transcript)
-
+
         if input_data.output_type == "string":
             self.logger.info("Transcription complete successfully. Result returned as string.")
-            msg = f"Transcription complete successfully. Result returned as string."
             return OutputModel(
-                message=msg,
-                string_transcription_result=full_transcript
+                transcription_result=full_transcript,
+                file_path_transcription_result=""
             )
-
+
         output_file_path = f"{self.results_path}/audio_transcription_result.txt"
         with open(output_file_path, "w") as f:
             f.write(full_transcript)
 
         if input_data.output_type == "file":
             self.logger.info(f"Transcription complete successfully. Result returned as file in {output_file_path}")
-            msg = f"Transcription complete successfully. Result returned as file."
             return OutputModel(
-                message=msg,
+                transcription_result="",
                 file_path_transcription_result=output_file_path
             )
 
         self.logger.info(f"Transcription complete successfully. Result returned as string and file in {output_file_path}")
-        msg = f"Transcription complete successfully. Result returned as string and file."
         return OutputModel(
-            message=msg,
-            string_transcription_result=full_transcript,
+            transcription_result=full_transcript,
             file_path_transcription_result=output_file_path
         )
-
+
     def format_display_result(self, input_data: InputModel, string_transcription_result: str):
         md_text = f"""
 ## Generated transcription: \n
@@ -91,4 +87,4 @@ def format_display_result(self, input_data: InputModel, string_transcription_res
         self.display_result = {
             "file_type": "md",
             "file_path": file_path
-        }
\ No newline at end of file
+        }
diff --git a/pieces/ImageGeneratorPiece/models.py b/pieces/ImageGeneratorPiece/models.py
index 54c135d..ff5d3d5 100644
--- a/pieces/ImageGeneratorPiece/models.py
+++ b/pieces/ImageGeneratorPiece/models.py
@@ -34,7 +34,6 @@ class InputModel(BaseModel):
     """
     ImageGeneratorPiece input model
     """
     prompt: str = Field(
-        ...,
         description="A text description of the desired image",
     )
     size: ImageSize = Field(
@@ -43,7 +42,7 @@
     )
     image_format: ImageFormat = Field(
         default=ImageFormat.url,
-        description="The format in which the generated image is returned", 
+        description="The format in which the generated image is returned",
     )
     output_type: OutputTypeType = Field(
         default=OutputTypeType.string,