Skip to content

Commit

Permalink
YouTube parser
Browse files Browse the repository at this point in the history
  • Loading branch information
nicola-corbellini committed Nov 17, 2023
1 parent 2ce7d01 commit c31f569
Show file tree
Hide file tree
Showing 7 changed files with 49 additions and 135 deletions.
18 changes: 1 addition & 17 deletions .github/workflows/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -18,26 +18,10 @@ permissions:
env:
PLUGIN_JSON: "0.0.1"
TAG_EXISTS: false
PLUGIN_NAME: "my_plugin"
PLUGIN_NAME: "catube"

jobs:
# This will be deleted by setup.py
check:
runs-on: ubuntu-latest
outputs:
plugin_name: ${{ steps.init.outputs.plugin_name }}
steps:
- name: Get plugin name
id: init
run: |
echo "plugin_name=${{ env.PLUGIN_NAME }}" >> $GITHUB_OUTPUT
# This is the end of the removed section
release:
# This will be deleted by setup.py
needs: check
if: startsWith(needs.check.outputs.plugin_name, 'MY_PLUGIN') == false
# This is the end of the removed section
runs-on: ubuntu-latest
steps:
- name: Checkout
Expand Down
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# My plugin
# Catube

[![awesome plugin](https://custom-icon-badges.demolab.com/static/v1?label=&message=awesome+plugin&color=383938&style=for-the-badge&logo=cheshire_cat_ai)](https://)
[![Awesome plugin](https://custom-icon-badges.demolab.com/static/v1?label=&message=Awesome+plugin&color=000000&style=for-the-badge&logo=cheshire_cat_ai)](https://)
Expand Down
38 changes: 38 additions & 0 deletions catube.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
import re
from abc import ABC
from typing import Iterator

from pytube import extract
from youtube_transcript_api import YouTubeTranscriptApi
from youtube_transcript_api.formatters import TextFormatter, JSONFormatter
from langchain.docstore.document import Document
from langchain.document_loaders.base import BaseBlobParser
from langchain.document_loaders.blob_loaders.schema import Blob

from cat.mad_hatter.decorators import hook


def parse_youtube_url(url: str) -> str:
    """Pull an 11-character video id out of a YouTube-style URL.

    The id is taken from the first position in ``url`` where either ``v=``
    or a ``/`` is immediately followed by eleven characters drawn from
    ``[0-9A-Za-z_-]``.

    Args:
        url: The text to scan.

    Returns:
        The captured id, or an empty string when nothing in ``url`` matches.
    """
    found = re.search(r"(?:v=|\/)([0-9A-Za-z_-]{11}).*", url)
    return found.group(1) if found else ""


class YoutubeParser(BaseBlobParser, ABC):
    """Blob parser that turns a YouTube video's transcript into a Document."""

    def __init__(self):
        # Formatter used by lazy_parse to render the fetched transcript as text.
        self.formatter = TextFormatter()

    def lazy_parse(self, blob: Blob) -> Iterator[Document]:
        """Yield a single Document holding the transcript of ``blob.source``.

        The video id is extracted from ``blob.source`` with pytube, and the
        transcript is requested for the languages ``"en"`` and ``"it"`` with
        formatting preserved.

        Args:
            blob: Blob whose ``source`` is passed to ``extract.video_id``.

        Yields:
            One Document whose ``page_content`` is the formatted transcript
            and whose ``metadata`` is an empty dict.
        """
        requested_id = extract.video_id(blob.source)

        fetched = YouTubeTranscriptApi.get_transcripts(
            [requested_id],
            languages=["en", "it"],
            preserve_formatting=True,
        )
        # The first element of the result is looked up by video id.
        segments = fetched[0][requested_id]

        yield Document(
            page_content=self.formatter.format_transcript(segments),
            metadata={},
        )


@hook
def rabbithole_instantiates_parsers(file_handlers: dict, cat) -> dict:
    """Register a ``YoutubeParser`` under the ``"video/mp4"`` key.

    Args:
        file_handlers: Mapping of keys to parser instances; it is modified
            in place.
        cat: Passed in by the caller; not used here.

    Returns:
        The same ``file_handlers`` dict, now containing the new entry.
    """
    file_handlers.update({"video/mp4": YoutubeParser()})
    return file_handlers
31 changes: 0 additions & 31 deletions my_plugin.py

This file was deleted.

14 changes: 7 additions & 7 deletions plugin.json
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
{
"name": "My plugin",
"name": "Catube",
"version": "0.0.1",
"description": "Description of my_plugin.",
"author_name": "Me",
"author_url": "https://mywebsite.me",
"plugin_url": "https://github.com/my_name/my_plugin",
"tags": "cat, template, example",
"thumb": "https://raw.githubusercontent.com/my_repo_path/my_plugin.png"
"description": "Description of catube.",
"author_name": "Nicola Corbellini",
"author_url": "",
"plugin_url": "https://github.com/Furrmidable-Crew/CaTube",
"tags": "Cat, RabbitHole, DocumentsLoading",
"thumb": "https://raw.githubusercontent.com/my_repo_path/catube.png"
}
2 changes: 2 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
youtube-transcript-api
pytube
79 changes: 0 additions & 79 deletions setup.py

This file was deleted.

0 comments on commit c31f569

Please sign in to comment.