Merge pull request #108 from chenqianhe/main

Add faster-whisper
mli · Oct 25, 2023 · 638f6d8 · 638f6d8
2 parents e898bcc + 64b4063
commit 638f6d8
Show file tree

Hide file tree

Showing 10 changed files with 226 additions and 31 deletions.
diff --git a/.github/workflows/ci.yml → .github/workflows/base.yml b/.github/workflows/ci.yml → .github/workflows/base.yml
@@ -39,8 +39,6 @@ jobs:
         run: |
           python -m pip install --upgrade pip
           pip install .
-          pip install pytest black
+          pip install pytest
       - name: Run Test
         run: pytest test/
-      - name: Run Lint
-        run: black . --check
diff --git a/.github/workflows/faster-whisper b/.github/workflows/faster-whisper
@@ -0,0 +1,44 @@
+name: Test Faster Whisper
+
+on:
+  pull_request:
+  push:
+    branches:
+      - main
+
+jobs:
+  lint_and_test:
+    runs-on: ${{ matrix.os }}-latest
+    strategy:
+      matrix:
+        python-version: ['3.9', '3.10']
+        # macos does not support M1 for now
+        os: [ubuntu, windows, macos]
+    steps:
+      - uses: actions/checkout@v3
+      - name: Set up Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v4
+        with:
+          python-version: ${{ matrix.python-version }}
+      - name: Set Variables
+        id: set_variables
+        shell: bash
+        run: |
+          echo "PY=$(python -c 'import hashlib, sys;print(hashlib.sha256(sys.version.encode()+sys.executable.encode()).hexdigest())')" >> $GITHUB_OUTPUT
+          echo "PIP_CACHE=$(pip cache dir)" >> $GITHUB_OUTPUT
+      - name: Cache PIP
+        uses: actions/cache@v3
+        with:
+          path: ${{ steps.set_variables.outputs.PIP_CACHE }}
+          key: ${{ runner.os }}-pip-${{ steps.set_variables.outputs.PY }}
+
+      - name: Setup ffmpeg for different platforms
+        uses: FedericoCarboni/setup-ffmpeg@master
+
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install ".[faster]"
+          pip install pytest
+      - name: Run Test
+        run: WHISPER_MODE=faster pytest test/
diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml
@@ -0,0 +1,40 @@
+name: Test Lint
+
+on:
+  pull_request:
+  push:
+    branches:
+      - main
+
+jobs:
+  lint:
+    runs-on: ${{ matrix.os }}-latest
+    strategy:
+      matrix:
+        python-version: ['3.9']
+        os: [ubuntu]
+    steps:
+      - uses: actions/checkout@v3
+      - name: Set up Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v4
+        with:
+          python-version: ${{ matrix.python-version }}
+      - name: Set Variables
+        id: set_variables
+        shell: bash
+        run: |
+          echo "PY=$(python -c 'import hashlib, sys;print(hashlib.sha256(sys.version.encode()+sys.executable.encode()).hexdigest())')" >> $GITHUB_OUTPUT
+          echo "PIP_CACHE=$(pip cache dir)" >> $GITHUB_OUTPUT
+      - name: Cache PIP
+        uses: actions/cache@v3
+        with:
+          path: ${{ steps.set_variables.outputs.PIP_CACHE }}
+          key: ${{ runner.os }}-pip-${{ steps.set_variables.outputs.PY }}
+
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install black
+
+      - name: Run Lint
+        run: black . --check
diff --git a/README.md b/README.md
@@ -2,6 +2,33 @@
 
 AutoCut 对你的视频自动生成字幕。然后你选择需要保留的句子，AutoCut 将对你视频中对应的片段裁切并保存。你无需使用视频编辑软件，只需要编辑文本文件即可完成剪切。
 
+**2023.10.14更新**：支持 faster-whisper 和指定依赖（但由于 Action 限制暂时移除了 faster-whisper 的测试运行）
+
+```shell
+# for whisper only
+pip install .
+
+# for whisper and faster-whisper
+pip install '.[faster]'
+
+# for whisper and openai api
+pip install '.[openai]'
+
+# for all
+pip install '.[all]'
+```
+
+```shell
+# using faster-whisper
+autocut -t xxx --whisper-mode=faster
+```
+
+```shell
+# using openai api
+export OPENAI_API_KEY=sk-xxx
+autocut -t xxx --whisper-mode=openai --openai-rpm=3
+```
+
 **2023.8.13更新**：支持调用 Openai Whisper API
 ```shell
 export OPENAI_API_KEY=sk-xxx

diff --git a/autocut/__init__.py b/autocut/__init__.py
@@ -1 +1 @@
-__version__ = "0.1.0"
+__version__ = "0.2.0"
diff --git a/autocut/transcribe.py b/autocut/transcribe.py
@@ -29,6 +29,11 @@ def __init__(self, args):
                     self.args.openai_rpm, self.sampling_rate
                 )
                 self.whisper_model.load()
+            elif self.args.whisper_mode == WhisperMode.FASTER.value:
+                self.whisper_model = whisper_model.FasterWhisperModel(
+                    self.sampling_rate
+                )
+                self.whisper_model.load(self.args.whisper_model, self.args.device)
         logging.info(f"Done Init model in {time.time() - tic:.1f} sec")
 
     def run(self):
@@ -93,6 +98,7 @@ def _transcribe(
                 audio, speech_array_indices, self.args.lang, self.args.prompt
             )
             if self.args.whisper_mode == WhisperMode.WHISPER.value
+            or self.args.whisper_mode == WhisperMode.FASTER.value
             else self.whisper_model.transcribe(
                 input, audio, speech_array_indices, self.args.lang, self.args.prompt
             )

diff --git a/autocut/type.py b/autocut/type.py
@@ -80,6 +80,7 @@ def get_values():
 class WhisperMode(Enum):
     WHISPER = "whisper"
     OPENAI = "openai"
+    FASTER = "faster"
 
     @staticmethod
     def get_values():

diff --git a/autocut/whisper_model.py b/autocut/whisper_model.py
@@ -166,7 +166,12 @@ def __init__(self, rpm: int, sample_rate=16000):
             raise Exception("OPENAI_API_KEY is not set")
 
     def load(self, model_name: Literal["whisper-1"] = "whisper-1"):
-        import openai
+        try:
+            import openai
+        except ImportError:
+            raise Exception(
+                "Please use openai mode(pip install '.[openai]') or all mode(pip install '.[all]')"
+            )
         from functools import partial
 
         self.whisper_model = partial(openai.Audio.transcribe, model=model_name)
@@ -303,3 +308,83 @@ def gen_srt(self, transcribe_results: List[srt.Subtitle]):
                 )
             subs.append(subtitle)
         return subs
+
+
+class FasterWhisperModel(AbstractWhisperModel):
+    def __init__(self, sample_rate=16000):
+        super().__init__("faster-whisper", sample_rate)
+        self.device = None
+
+    def load(
+        self,
+        model_name: Literal[
+            "tiny", "base", "small", "medium", "large", "large-v2"
+        ] = "small",
+        device: Union[Literal["cpu", "cuda"], None] = None,
+    ):
+        try:
+            from faster_whisper import WhisperModel
+        except ImportError:
+            raise Exception(
+                "Please use faster mode(pip install '.[faster]') or all mode(pip install '.[all]')"
+            )
+
+        self.device = device if device else "cpu"
+        self.whisper_model = WhisperModel(model_name, self.device)
+
+    def _transcribe(self):
+        raise Exception("Not implemented")
+
+    def transcribe(
+        self,
+        audio: np.ndarray,
+        speech_array_indices: List[SPEECH_ARRAY_INDEX],
+        lang: LANG,
+        prompt: str,
+    ):
+        res = []
+        for seg in speech_array_indices:
+            segments, info = self.whisper_model.transcribe(
+                audio[int(seg["start"]) : int(seg["end"])],
+                task="transcribe",
+                language=lang,
+                initial_prompt=prompt,
+                vad_filter=False,
+            )
+            segments = list(segments)  # The transcription will actually run here.
+            r = {"origin_timestamp": seg, "segments": segments, "info": info}
+            res.append(r)
+        return res
+
+    def gen_srt(self, transcribe_results):
+        subs = []
+
+        def _add_sub(start, end, text):
+            subs.append(
+                srt.Subtitle(
+                    index=0,
+                    start=datetime.timedelta(seconds=start),
+                    end=datetime.timedelta(seconds=end),
+                    content=cc.convert(text.strip()),
+                )
+            )
+
+        prev_end = 0
+        for r in transcribe_results:
+            origin = r["origin_timestamp"]
+            for seg in r["segments"]:
+                s = dict(start=seg.start, end=seg.end, text=seg.text)
+                start = s["start"] + origin["start"] / self.sample_rate
+                end = min(
+                    s["end"] + origin["start"] / self.sample_rate,
+                    origin["end"] / self.sample_rate,
+                )
+                if start > end:
+                    continue
+                # mark any empty segment that is not very short
+                if start > prev_end + 1.0:
+                    _add_sub(prev_end, start, "< No Speech >")
+                _add_sub(start, end, s["text"])
+                prev_end = end
+
+        return subs
diff --git a/setup.py b/setup.py
@@ -1,8 +1,8 @@
 from setuptools import setup, find_packages
 
 requirements = [
+    "ffmpeg-python",
     "moviepy",
-    "openai",
     "openai-whisper",
     "opencc-python-reimplemented",
     "parameterized",
@@ -16,6 +16,11 @@
 setup(
     name="autocut",
     install_requires=requirements,
+    extras_require={
+        "all": ["openai", "faster-whisper"],
+        "openai": ["openai"],
+        "faster": ["faster-whisper"],
+    },
     packages=find_packages(),
     entry_points={
         "console_scripts": [

diff --git a/test/config.py b/test/config.py
@@ -43,29 +43,18 @@
 
 
 class TestArgs:
-    def __init__(
-        self,
-        encoding="utf-8",
-        sampling_rate=16000,
-        bitrate="10m",
-        lang="zh",
-        prompt="",
-        whisper_model="small",
-        device=None,
-        vad=False,
-        force=False,
-        whisper_mode="whisper",
-        openai_rpm=3,
-    ):
+    def __init__(self):
         self.inputs = []
-        self.bitrate = bitrate
-        self.encoding = encoding
-        self.sampling_rate = sampling_rate
-        self.lang = lang
-        self.prompt = prompt
-        self.whisper_model = whisper_model
-        self.device = device
-        self.vad = vad
-        self.force = force
-        self.whisper_mode = whisper_mode
-        self.openai_rpm = openai_rpm
+        self.bitrate = "10m"
+        self.encoding = "utf-8"
+        self.sampling_rate = 16000
+        self.lang = "zh"
+        self.prompt = ""
+        self.whisper_model = "small"
+        self.device = None
+        self.vad = False
+        self.force = False
+        self.whisper_mode = (
+            "faster" if os.environ.get("WHISPER_MODE") == "faster" else "whisper"
+        )
+        self.openai_rpm = 3