xsInit

profusion · May 29, 2023 · 8b434f4 · 8b434f4
commit 8b434f4
Show file tree

Hide file tree

Showing 55 changed files with 3,626 additions and 0 deletions.
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,7 @@
+/.venv/
+/dist/
+/build/
+__pycache__
+*.pyc
+/videos/
+.DS_Store
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -0,0 +1,44 @@
+# See https://pre-commit.com for more information
+# See https://pre-commit.com/hooks.html for more hooks
+
+default_install_hook_types: [pre-commit, pre-push, pre-merge-commit]
+
+repos:
+-   repo: https://github.com/pre-commit/pre-commit-hooks
+    rev: v4.4.0
+    hooks:
+    -   id: check-executables-have-shebangs
+    -   id: check-json
+    -   id: check-merge-conflict
+    -   id: check-toml
+    -   id: check-yaml
+    -   id: end-of-file-fixer
+    -   id: trailing-whitespace
+
+-   repo: https://github.com/python/black
+    rev: 23.3.0
+    hooks:
+    -   id: black
+
+-   repo: https://github.com/sqlalchemyorg/zimports/
+    rev: v0.4.5
+    hooks:
+    -   id: zimports
+
+-   repo: https://github.com/pycqa/flake8
+    rev: 6.0.0
+    hooks:
+    -   id: flake8
+        additional_dependencies:
+          - flake8-import-order
+          - flake8-import-single
+          - flake8-builtins
+          - flake8-future-annotations
+          - flake8-docstrings
+          - flake8-rst-docstrings
+          - pygments
+
+-   repo: https://github.com/pre-commit/mirrors-mypy
+    rev: v1.3.0
+    hooks:
+    -   id: mypy
diff --git a/.python-version b/.python-version
@@ -0,0 +1 @@
+3.11
diff --git a/README.rst b/README.rst
@@ -0,0 +1,190 @@
+ProFUSION Video Transcribe
+==========================
+
+Install
+-------
+
+Install the project using `Poetry <https://python-poetry.org/docs/#installation>`_:
+
+.. code-block:: console
+
+    $ poetry install --with dev
+    Installing dependencies from lock file
+    ...
+    Installing the current project: pf-video-transcribe
+
+This project uses `Faster Whisper <https://pypi.org/project/faster-whisper/>`_,
+a faster implementation of `OpenAI's Whisper <https://openai.com/research/whisper>`_,
+which in turn is built on top of `CTranslate2 <https://opennmt.net/CTranslate2/index.html>`_
+hardware optimizations and requires the **NVIDIA CUDA libraries** to be installed; see
+`their installation instructions <https://opennmt.net/CTranslate2/installation.html>`_.
+
+Run
+---
+
+Run the command line tool:
+
+.. code-block:: console
+
+    $ pf-video-transcribe --help
+
+All commands take ``--log=LEVEL`` or ``--log=DOMAIN:LEVEL`` to change the
+log level of every package, such as ``pf_video_transcribe.transcribe``,
+``faster_whisper`` and so on. If no domain is given, then the provided level
+applies to all log domains. This is a global option and should be specified
+before the subcommand.
+
+Subcommands are explained in the next sections.
+
+Transcription
+=============
+
+Given that transcription is a heavy process that takes a long time to load the model
+and then to process each media file, it's implemented as a batch operation that
+generates an intermediate file in the `JSON Lines <https://jsonlines.org/>`_
+(``".jsonl"``) format, with a ``"header"`` line followed by all the ``"segment"`` lines
+and ended by a ``"finished"`` line with a success or failure indicator. Each ``segment``
+carries the useful information extracted by
+`OpenAI Whisper <https://github.com/openai/whisper>`_:
+
+.. code-block:: console
+
+    $ pf-video-transcribe transcribe videos/my-video.mp4 videos/other-video.mp4
+
+
+This will generate ``videos/my-video.jsonl`` and ``videos/other-video.jsonl``.
+
+Note that the first run will take a long time to download the model from the internet.
+On subsequent runs the local model will be used, but it will first be checked against
+the remote -- which can also take time. Using the ``--local`` flag will skip that check.
+
+The language is auto-detected from the first 30 seconds of actual sound (silence is
+ignored), but if you do know the language, use the ``--language=LANG`` flag.
+
+Audio Speech Recognition (ASR) models work on slices of the media, producing segments
+that are smaller than an actual human language sentence/phrase.
+The ``--merge-threshold=SECONDS`` will merge sibling segments if:
+``next_segment.start - last_segment.end <= merge_threshold``. The default is 1 second.
+
+A more complex example:
+
+.. code-block:: console
+
+    $ pf-video-transcribe \
+          --log=DEBUG \
+          transcribe \
+          --local \
+          --language=pt \
+          --merge-threshold=5 \
+          videos/my-video.mp4 videos/other-video.mp4
+
+With the transcribed ``".jsonl"`` one can convert to more usable formats,
+see the next sections.
+
+
+Convert to HTML
+===============
+
+This generates the HTML meant for easy viewing of the result: a ``<video>`` linking
+to the transcribed media alongside a ``<track kind="subtitles">`` linking to the
+subtitles, the thumbnail to be used by `OpenGraph <https://ogp.me/>`_ ``og:image``
+and the actual transcription segments.
+
+Note: both ``.vtt`` (subtitles) and ``.jpeg`` (thumbnail) are auto-generated
+if they don't exist or if they are older than the actual input ``.jsonl``.
+
+
+Convert to VTT
+==============
+
+Web Video Text Tracks (WebVTT) is a subtitle format specified by the
+`W3C <https://w3c.github.io/webvtt/>`_ and used by all web browsers whenever
+specified inside the ``<video>`` element.
+
+The conversion takes parameter ``--duration-threshold=SECONDS`` to control the maximum
+duration of a single subtitle entry.
+
+.. code-block:: console
+
+    $ pf-video-transcribe vtt videos/*.jsonl
+
+
+Convert to SRT
+==============
+
+SRT or SubRip is a de facto standard subtitle format that most media players will take.
+The conversion takes parameter ``--duration-threshold=SECONDS`` to control the maximum
+duration of a single subtitle entry.
+
+.. code-block:: console
+
+    $ pf-video-transcribe srt videos/*.jsonl
+
+
+Create Thumbnail
+================
+
+Uses `FFmpeg <https://ffmpeg.org/>`_ to generate a thumbnail from the video or
+its transcription. The ``--size=WIDTHxHEIGHT`` option allows overriding the default
+``320x-1`` (-1 is used to calculate that dimension from the other, keeping the
+aspect ratio).
+
+.. code-block:: console
+
+    $ pf-video-transcribe thumbnail videos/*.jsonl
+
+
+Creating Index HTML
+===================
+
+Recursively scans the given directories looking for ``.html`` files, which
+can be produced by this tool or not. The generated index will take the ``<title>``
+and ``<meta property="og:image">`` to gather the actual title or preview.
+
+It's a very simple way to generate a landing page.
+
+.. code-block:: console
+
+    $ pf-video-transcribe index_html videos/
+
+
+Serving (Development)
+=====================
+
+While developing this tool or playing with parameters it's useful to serve
+the files from ``http://`` as the ``file://`` will have some issues with
+video files (security limitations). By default serves at ``--port=8000``.
+
+.. code-block:: console
+
+    $ pf-video-transcribe serve videos/
+
+
+Development
+-----------
+
+Install the project with development dependencies:
+
+.. code-block:: console
+
+    $ poetry install --with dev
+    Installing dependencies from lock file
+    ...
+    Installing the current project: pf-video-transcribe
+
+
+Install `pre-commit <https://pre-commit.com/>`_ on your machine, then install the Git hooks:
+
+.. code-block:: console
+
+    $ pre-commit install
+    pre-commit installed at .git/hooks/pre-commit
+    pre-commit installed at .git/hooks/pre-push
+    pre-commit installed at .git/hooks/pre-merge-commit
+
+
+Used tools:
+
+* Code Formatter: `Black <https://black.readthedocs.io/>`_
+* Static Type Checker: `MyPy <https://mypy.readthedocs.io/>`_
+* Style Enforcement/Linter: `Flake8 <https://flake8.pycqa.org/>`_
diff --git a/pf_video_transcribe/__init__.py b/pf_video_transcribe/__init__.py
diff --git a/pf_video_transcribe/__main__.py b/pf_video_transcribe/__main__.py
@@ -0,0 +1,40 @@
+import argparse
+
+from . import log
+from .html import cli as html
+from .index_html import cli as index_html
+from .serve import cli as serve
+from .srt import cli as srt
+from .thumbnail import cli as thumbnail
+from .transcribe import cli as transcribe
+from .vtt import cli as vtt
+
+
+def create_arg_parser() -> argparse.ArgumentParser:
+    ap = argparse.ArgumentParser()
+    log.add_arguments(ap)
+
+    sub = ap.add_subparsers()
+    transcribe.add_sub_parser(sub)
+    html.add_sub_parser(sub)
+    vtt.add_sub_parser(sub)
+    srt.add_sub_parser(sub)
+    thumbnail.add_sub_parser(sub)
+    index_html.add_sub_parser(sub)
+    serve.add_sub_parser(sub)
+
+    return ap
+
+
+def cli() -> None:
+    ap = create_arg_parser()
+    args = ap.parse_args()
+    log.config(args)
+
+    handle = getattr(args, "handle", None)
+    if handle is not None:
+        handle(args)
+
+
+if __name__ == "__main__":
+    cli()
diff --git a/pf_video_transcribe/abstract_subtitles/__init__.py b/pf_video_transcribe/abstract_subtitles/__init__.py
diff --git a/pf_video_transcribe/abstract_subtitles/cli.py b/pf_video_transcribe/abstract_subtitles/cli.py
@@ -0,0 +1,43 @@
+from argparse import ArgumentParser
+import textwrap
+
+from ..utils import check_file_exists
+
+
+def add_arguments(ap: ArgumentParser, add_files: bool) -> None:
+    ap.add_argument(
+        "-t",
+        "--duration-threshold",
+        type=float,
+        default=10.0,
+        help=textwrap.dedent(
+            """\
+            Duration (in seconds) to split each segment. The split will
+            happen at word boundary, no words will be split.
+
+            Default: %(default)s seconds
+        """
+        ),
+    )
+    ap.add_argument(
+        "-f",
+        "--force",
+        default=False,
+        action="store_true",
+        help=textwrap.dedent(
+            """\
+            Force regeneration of existing files.
+
+            By default, if the generated file timestamp (mtime) is newer than the
+            source (jsonl), then it will be skipped.
+        """
+        ),
+    )
+
+    if add_files:
+        ap.add_argument(
+            "file",
+            nargs="+",
+            help="jsonl file to be processed",
+            type=check_file_exists,
+        )
diff --git a/pf_video_transcribe/abstract_subtitles/converter.py b/pf_video_transcribe/abstract_subtitles/converter.py
@@ -0,0 +1,18 @@
+from typing import TypedDict
+
+from ..converter import AbstractJsonlConverter
+from ..jsonl.reader import Reader
+from ..utils import iter_split_segments
+
+KT = TypedDict(
+    "KT",
+    {
+        "duration_threshold": float,
+    },
+)
+
+
+class AbstractSubtitlesConverter(AbstractJsonlConverter[KT]):
+    def get_template_context(self, reader: Reader) -> dict:
+        duration_threshold = self.kwargs["duration_threshold"]
+        return {"segments": iter_split_segments(iter(reader), duration_threshold)}