From 02e0fbdbc5563dadbc277cb72cfd3a3e1418464f Mon Sep 17 00:00:00 2001 From: Davincible Date: Wed, 26 Apr 2023 01:45:34 +0200 Subject: [PATCH 1/4] fix: download speed --- client.go | 336 ++++++++++++++++++++++++++-------- client_test.go | 14 +- downloader/downloader_test.go | 5 +- format_list.go | 2 + itag_test.go | 2 +- playlist.go | 81 ++++++-- utils.go | 105 +++++++++++ video.go | 4 + video_id.go | 1 + video_test.go | 36 +++- 10 files changed, 486 insertions(+), 100 deletions(-) create mode 100644 utils.go diff --git a/client.go b/client.go index 438b8990..1a574f2e 100644 --- a/client.go +++ b/client.go @@ -4,11 +4,24 @@ import ( "bytes" "context" "encoding/json" + "errors" "fmt" "io" "log" + "math/rand" "net/http" "strconv" + "sync" +) + +const ( + Size1Kb = 1024 + Size1Mb = Size1Kb * 1024 + Size10Mb = Size1Mb * 10 +) + +var ( + ErrNoFormat = errors.New("no video format provided") ) // Client offers methods to download video metadata and video streams. @@ -20,8 +33,18 @@ type Client struct { // If not set, http.DefaultClient will be used HTTPClient *http.Client + // MaxRoutines to use when downloading a video. + MaxRoutines int + + // ChunkSize to use when downloading videos in chunks. Default is Size10Mb. 
+ ChunkSize int64 + // playerCache caches the JavaScript code of a player response playerCache playerCache + + client *clientInfo + + consentID string } // GetVideo fetches video metadata @@ -35,58 +58,62 @@ func (c *Client) GetVideoContext(ctx context.Context, url string) (*Video, error if err != nil { return nil, fmt.Errorf("extractVideoID failed: %w", err) } + return c.videoFromID(ctx, id) } func (c *Client) videoFromID(ctx context.Context, id string) (*Video, error) { - body, err := c.videoDataByInnertube(ctx, id, webClient) + c.client = &androidClient + + body, err := c.videoDataByInnertube(ctx, id) if err != nil { return nil, err } - v := &Video{ + v := Video{ ID: id, } - err = v.parseVideoInfo(body) // return early if all good - if err == nil { - return v, nil + if err = v.parseVideoInfo(body); err == nil { + return &v, nil } // If the uploader has disabled embedding the video on other sites, parse video page - if err == ErrNotPlayableInEmbed { + if errors.Is(err, ErrNotPlayableInEmbed) { // additional parameters are required to access clips with sensitiv content html, err := c.httpGetBodyBytes(ctx, "https://www.youtube.com/watch?v="+id+"&bpctr=9999999999&has_verified=1") if err != nil { return nil, err } - return v, v.parseVideoPage(html) + return &v, v.parseVideoPage(html) } // If the uploader marked the video as inappropriate for some ages, use embed player - if err == ErrLoginRequired { - bodyEmbed, errEmbed := c.videoDataByInnertube(ctx, id, embeddedClient) + if errors.Is(err, ErrLoginRequired) { + c.client = &embeddedClient + + bodyEmbed, errEmbed := c.videoDataByInnertube(ctx, id) if errEmbed == nil { errEmbed = v.parseVideoInfo(bodyEmbed) } if errEmbed == nil { - return v, nil + return &v, nil } // private video clearly not age-restricted and thus should be explicit if errEmbed == ErrVideoPrivate { - return v, errEmbed + return &v, errEmbed } // wrapping error so its clear whats happened - return v, fmt.Errorf("can't bypass age restriction: %w", 
errEmbed) + return &v, fmt.Errorf("can't bypass age restriction: %w", errEmbed) } // undefined error - return v, err + return &v, err } type innertubeRequest struct { @@ -95,6 +122,9 @@ type innertubeRequest struct { Continuation string `json:"continuation,omitempty"` Context inntertubeContext `json:"context"` PlaybackContext playbackContext `json:"playbackContext,omitempty"` + ContentCheckOK bool `json:"contentCheckOk"` + racyCheckOk bool `json:"racyCheckOk"` + Params string `json:"params"` } type playbackContext struct { @@ -102,7 +132,8 @@ type playbackContext struct { } type contentPlaybackContext struct { - SignatureTimestamp string `json:"signatureTimestamp"` + // SignatureTimestamp string `json:"signatureTimestamp"` + html5Preference string `json:"html5Preference"` } type inntertubeContext struct { @@ -110,68 +141,78 @@ type inntertubeContext struct { } type innertubeClient struct { - HL string `json:"hl"` - GL string `json:"gl"` - ClientName string `json:"clientName"` - ClientVersion string `json:"clientVersion"` + HL string `json:"hl"` + GL string `json:"gl"` + ClientName string `json:"clientName"` + ClientVersion string `json:"clientVersion"` + AndroidSDKVersion int `json:"androidSDKVersion,omitempty"` + UserAgent string `json:"userAgent,omitempty"` + TimeZone string `json:"timeZone"` + UTCOffset int `json:"utcOffsetMinutes"` } // client info for the innertube API type clientInfo struct { - name string - key string - version string + name string + key string + version string + userAgent string + androidVersion int } var ( // might add ANDROID and other in future, but i don't see reason yet webClient = clientInfo{ - name: "WEB", - version: "2.20210617.01.00", - key: "AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8", + name: "WEB", + version: "2.20210617.01.00", + key: "AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8", + userAgent: "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36", } - embeddedClient = 
clientInfo{ - name: "WEB_EMBEDDED_PLAYER", - version: "1.19700101", - key: "AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8", // seems like same key works for both clients - } -) - -func (c *Client) videoDataByInnertube(ctx context.Context, id string, clientInfo clientInfo) ([]byte, error) { - config, err := c.getPlayerConfig(ctx, id) - if err != nil { - return nil, err + androidClient = clientInfo{ + name: "ANDROID", + version: "17.31.35", + key: "AIzaSyA8eiZmM1FaDVjRy-df2KTyQ_vz_yYM39w", + userAgent: "com.google.android.youtube/17.31.35 (Linux; U; Android 11) gzip", + androidVersion: 30, } - // fetch sts first - sts, err := config.getSignatureTimestamp() - if err != nil { - return nil, err + embeddedClient = clientInfo{ + name: "WEB_EMBEDDED_PLAYER", + version: "1.19700101", + key: "AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8", // seems like same key works for both clients + userAgent: "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36", } +) - context := prepareInnertubeContext(clientInfo) - +func (c *Client) videoDataByInnertube(ctx context.Context, id string) ([]byte, error) { data := innertubeRequest{ - VideoID: id, - Context: context, + VideoID: id, + Context: prepareInnertubeContext(*c.client), + ContentCheckOK: true, + racyCheckOk: true, + Params: "8AEB", PlaybackContext: playbackContext{ ContentPlaybackContext: contentPlaybackContext{ - SignatureTimestamp: sts, + // SignatureTimestamp: sts, + html5Preference: "HTML5_PREF_WANTS", }, }, } - return c.httpPostBodyBytes(ctx, "https://www.youtube.com/youtubei/v1/player?key="+clientInfo.key, data) + return c.httpPostBodyBytes(ctx, "https://www.youtube.com/youtubei/v1/player?key="+c.client.key, data) } func prepareInnertubeContext(clientInfo clientInfo) inntertubeContext { return inntertubeContext{ Client: innertubeClient{ - HL: "en", - GL: "US", - ClientName: clientInfo.name, - ClientVersion: clientInfo.version, + HL: "en", + GL: "US", + TimeZone: "UTC", + 
ClientName: clientInfo.name, + ClientVersion: clientInfo.version, + AndroidSDKVersion: clientInfo.androidVersion, + UserAgent: clientInfo.userAgent, }, } } @@ -180,10 +221,22 @@ func prepareInnertubePlaylistData(ID string, continuation bool, clientInfo clien context := prepareInnertubeContext(clientInfo) if continuation { - return innertubeRequest{Context: context, Continuation: ID} + return innertubeRequest{ + Context: context, + Continuation: ID, + ContentCheckOK: true, + racyCheckOk: true, + Params: "8AEB", + } } - return innertubeRequest{Context: context, BrowseID: "VL" + ID} + return innertubeRequest{ + Context: context, + BrowseID: "VL" + ID, + ContentCheckOK: true, + racyCheckOk: true, + Params: "8AEB", + } } // GetPlaylist fetches playlist metadata @@ -195,13 +248,15 @@ func (c *Client) GetPlaylist(url string) (*Playlist, error) { // for these videos. Playlist entries cannot be downloaded, as they lack all the required metadata, but // can be used to enumerate all IDs, Authors, Titles, etc. func (c *Client) GetPlaylistContext(ctx context.Context, url string) (*Playlist, error) { + c.client = &androidClient + id, err := extractPlaylistID(url) if err != nil { return nil, fmt.Errorf("extractPlaylistID failed: %w", err) } - data := prepareInnertubePlaylistData(id, false, webClient) - body, err := c.httpPostBodyBytes(ctx, "https://www.youtube.com/youtubei/v1/browse?key="+webClient.key, data) + data := prepareInnertubePlaylistData(id, false, *c.client) + body, err := c.httpPostBodyBytes(ctx, "https://www.youtube.com/youtubei/v1/browse?key="+c.client.key, data) if err != nil { return nil, err } @@ -243,7 +298,19 @@ func (c *Client) GetStreamContext(ctx context.Context, video *Video, format *For contentLength = c.downloadOnce(req, w, format) } else { // we have length information, let's download by chunks! 
- go c.downloadChunked(req, w, format) + data, err := c.downloadChunked(ctx, req, format) + if err != nil { + return nil, 0, err + } + + go func() { + if _, err := w.Write(data); err != nil { + w.CloseWithError(err) + return + } + + w.Close() //nolint:errcheck + }() } return r, contentLength, nil @@ -274,40 +341,98 @@ func (c *Client) downloadOnce(req *http.Request, w *io.PipeWriter, _ *Format) in return length } -func (c *Client) downloadChunked(req *http.Request, w *io.PipeWriter, format *Format) { - const chunkSize int64 = 10_000_000 - // Loads a chunk a returns the written bytes. - // Downloading in multiple chunks is much faster: - // https://github.com/kkdai/youtube/pull/190 - loadChunk := func(pos int64) (int64, error) { - req.Header.Set("Range", fmt.Sprintf("bytes=%v-%v", pos, pos+chunkSize-1)) +type chunkData struct { + index int + data []byte +} - resp, err := c.httpDo(req) - if err != nil { - return 0, err - } - defer resp.Body.Close() +func (c *Client) getChunkSize() int64 { + if c.ChunkSize > 0 { + return c.ChunkSize + } - if resp.StatusCode != http.StatusPartialContent { - return 0, ErrUnexpectedStatusCode(resp.StatusCode) - } + return Size1Mb +} + +func (c *Client) getMaxRoutines(limit int) int { + routines := 10 + + if c.MaxRoutines > 0 { + routines = c.MaxRoutines + } + + if limit > 0 && routines > limit { + routines = limit + } + + return routines +} - return io.Copy(w, resp.Body) +func (c *Client) downloadChunked(ctx context.Context, req *http.Request, format *Format) ([]byte, error) { + chunks := getChunks(format.ContentLength, c.getChunkSize()) + maxRoutines := c.getMaxRoutines(len(chunks)) + + chunkChan := make(chan chunk, len(chunks)) + chunkDataChan := make(chan chunkData, len(chunks)) + errChan := make(chan error, 1) + + for _, c := range chunks { + chunkChan <- c } + close(chunkChan) + + var wg sync.WaitGroup - defer w.Close() + for i := 0; i < maxRoutines; i++ { + wg.Add(1) + + go func() { + defer wg.Done() + + for { + select { + case 
<-ctx.Done(): + errChan <- context.DeadlineExceeded + return + case ch, open := <-chunkChan: + if !open { + return + } + + data, err := c.downloadChunk(req.Clone(ctx), ch) + if err != nil { + errChan <- err + return + } + + chunkDataChan <- chunkData{ch.index, data} + } + } + }() + } + wg.Wait() - //nolint:revive,errcheck - // load all the chunks - for pos := int64(0); pos < format.ContentLength; { - written, err := loadChunk(pos) + close(errChan) + close(chunkDataChan) + + for err := range errChan { if err != nil { - w.CloseWithError(err) - return + return nil, err } + } + + chunkDatas := make([]chunkData, len(chunks)) - pos += written + for cd := range chunkDataChan { + chunkDatas[cd.index] = cd } + + data := make([]byte, 0, format.ContentLength) + for _, chunk := range chunkDatas { + data = append(data, chunk.data...) + } + + return data, nil } // GetStreamURL returns the url for a specific format @@ -317,10 +442,17 @@ func (c *Client) GetStreamURL(video *Video, format *Format) (string, error) { // GetStreamURLContext returns the url for a specific format with a context func (c *Client) GetStreamURLContext(ctx context.Context, video *Video, format *Format) (string, error) { + if format == nil { + return "", ErrNoFormat + } + if format.URL != "" { - return c.unThrottle(ctx, video.ID, format.URL) + return format.URL, nil + // return c.unThrottle(ctx, video.ID, format.URL) } + // TODO: check rest of this function, is it redundant? 
+ cipher := format.Cipher if cipher == "" { return "", ErrCipherNotFound @@ -345,6 +477,21 @@ func (c *Client) httpDo(req *http.Request) (*http.Response, error) { log.Println(req.Method, req.URL) } + req.Header.Set("User-Agent", c.client.userAgent) + req.Header.Set("Origin", "https://youtube.com") + req.Header.Set("Sec-Fetch-Mode", "navigate") + + if len(c.consentID) == 0 { + c.consentID = strconv.Itoa(rand.Intn(899) + 100) + } + + req.AddCookie(&http.Cookie{ + Name: "CONSENT", + Value: "YES+cb.20210328-17-p0.en+FX+" + c.consentID, + Path: "/", + Domain: ".youtube.com", + }) + res, err := client.Do(req) if c.Debug && res != nil { @@ -396,6 +543,11 @@ func (c *Client) httpPost(ctx context.Context, url string, body interface{}) (*h return nil, err } + req.Header.Set("X-Youtube-Client-Name", "3") + req.Header.Set("X-Youtube-Client-Version", c.client.version) + req.Header.Set("Content-Type", "application/json") + req.Header.Set("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8") + resp, err := c.httpDo(req) if err != nil { return nil, err @@ -418,3 +570,29 @@ func (c *Client) httpPostBodyBytes(ctx context.Context, url string, body interfa return io.ReadAll(resp.Body) } + +// downloadChunk returns the chunk bytes. 
+// Downloading in multiple chunks is much faster: +// https://github.com/kkdai/youtube/pull/190 +func (c *Client) downloadChunk(req *http.Request, chunk chunk) ([]byte, error) { + q := req.URL.Query() + q.Set("range", fmt.Sprintf("%d-%d", chunk.start, chunk.end)) + req.URL.RawQuery = q.Encode() + + resp, err := c.httpDo(req) + if err != nil { + return nil, ErrUnexpectedStatusCode(resp.StatusCode) + } + defer resp.Body.Close() + + if resp.StatusCode < http.StatusOK && resp.StatusCode >= 300 { + return nil, ErrUnexpectedStatusCode(resp.StatusCode) + } + + b, err := io.ReadAll(resp.Body) + if err != nil { + return nil, fmt.Errorf("read chunk body: %w", err) + } + + return b, nil +} diff --git a/client_test.go b/client_test.go index 3d6205e0..92df2d66 100644 --- a/client_test.go +++ b/client_test.go @@ -83,7 +83,7 @@ func TestGetVideoWithoutManifestURL(t *testing.T) { assert, require := assert.New(t), require.New(t) video, err := testClient.GetVideo(dwlURL) - require.NoError(err) + require.NoError(err, "get video") require.NotNil(video) assert.NotEmpty(video.Thumbnails) @@ -95,9 +95,11 @@ func TestGetVideoWithoutManifestURL(t *testing.T) { assert.Equal("rFejpH_tAHM", video.ID) assert.Equal("dotGo 2015 - Rob Pike - Simplicity is Complicated", video.Title) assert.Equal("dotconferences", video.Author) - assert.Equal(1392*time.Second, video.Duration) + assert.GreaterOrEqual(video.Duration, 1390*time.Second) assert.Contains(video.Description, "Go is often described as a simple language.") - assert.Equal("2015-12-02 00:00:00 +0000 UTC", video.PublishDate.String()) + + // Publishing date doesn't seem to be present in android client + // assert.Equal("2015-12-02 00:00:00 +0000 UTC", video.PublishDate.String()) } func TestGetVideoWithManifestURL(t *testing.T) { @@ -175,8 +177,10 @@ func TestGetBigPlaylist(t *testing.T) { assert.NotEmpty(playlist.Description) assert.NotEmpty(playlist.Author) - assert.Greater(len(playlist.Videos), 100) - assert.NotEmpty(playlist.Videos[100].ID) 
+ assert.Greater(len(playlist.Videos), 300) + assert.NotEmpty(playlist.Videos[300].ID) + + t.Logf("Playlist Title: %s, Video Count: %d", playlist.Title, len(playlist.Videos)) } func TestClient_httpGetBodyBytes(t *testing.T) { diff --git a/downloader/downloader_test.go b/downloader/downloader_test.go index a94468ff..84910715 100644 --- a/downloader/downloader_test.go +++ b/downloader/downloader_test.go @@ -7,9 +7,10 @@ import ( "testing" "time" - "github.com/kkdai/youtube/v2" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" + + "github.com/kkdai/youtube/v2" ) var testDownloader = func() (dl Downloader) { @@ -39,7 +40,7 @@ func TestDownload_FirstStream(t *testing.T) { assert.Equal(`youtube-dl test video "'/\ä↭𝕐`, video.Title) assert.Equal(`Philipp Hagemeister`, video.Author) assert.Equal(10*time.Second, video.Duration) - assert.Len(video.Formats, 18) + assert.GreaterOrEqual(len(video.Formats), 18) if assert.Greater(len(video.Formats), 0) { assert.NoError(testDownloader.Download(ctx, video, &video.Formats[0], "")) diff --git a/format_list.go b/format_list.go index e2a0ae7e..827d2f0d 100644 --- a/format_list.go +++ b/format_list.go @@ -9,6 +9,8 @@ import ( type FormatList []Format // FindByQuality returns the first format matching Quality or QualityLabel +// +// Examples: tiny, small, medium, large, 720p, hd720, hd1080 func (list FormatList) FindByQuality(quality string) *Format { for i := range list { if list[i].Quality == quality || list[i].QualityLabel == quality { diff --git a/itag_test.go b/itag_test.go index 66e83bcb..bb919ce2 100644 --- a/itag_test.go +++ b/itag_test.go @@ -14,5 +14,5 @@ func TestYoutube_GetItagInfo(t *testing.T) { url := "https://www.youtube.com/watch?v=rFejpH_tAHM" video, err := client.GetVideo(url) require.NoError(err) - require.Len(video.Formats, 24) + require.GreaterOrEqual(len(video.Formats), 24) } diff --git a/playlist.go b/playlist.go index 4b73488d..12abca83 100644 --- a/playlist.go +++ b/playlist.go @@ -5,6 
+5,7 @@ import ( "encoding/json" "fmt" "regexp" + "runtime/debug" "strconv" "time" @@ -68,8 +69,9 @@ func (p *Playlist) parsePlaylistInfo(ctx context.Context, client *Client, body [ } defer func() { + stack := debug.Stack() if r := recover(); r != nil { - err = fmt.Errorf("JSON parsing error: %v", r) + err = fmt.Errorf("JSON parsing error: %v\n%s", r, stack) } }() @@ -80,27 +82,70 @@ func (p *Playlist) parsePlaylistInfo(ctx context.Context, client *Client, body [ return ErrPlaylistStatus{Reason: message} } - p.Title = j.GetPath("metadata", "playlistMetadataRenderer", "title").MustString() - p.Description = j.GetPath("metadata", "playlistMetadataRenderer", "description").MustString() + // Metadata can be located in multiple places depending on client type + var metadata *sjson.Json + if node, ok := j.CheckGet("metadata"); ok { + metadata = node + } else if node, ok := j.CheckGet("header"); ok { + metadata = node + } else { + return fmt.Errorf("no playlist header / metadata found") + } + + metadata = metadata.Get("playlistHeaderRenderer") + + p.Title = getText(metadata, "title") + p.Description = getText(metadata, "description", "descriptionText") p.Author = j.GetPath("sidebar", "playlistSidebarRenderer", "items").GetIndex(1). GetPath("playlistSidebarSecondaryInfoRenderer", "videoOwner", "videoOwnerRenderer", "title", "runs"). GetIndex(0).Get("text").MustString() - vJSON, err := j.GetPath("contents", "twoColumnBrowseResultsRenderer", "tabs").GetIndex(0). - GetPath("tabRenderer", "content", "sectionListRenderer", "contents").GetIndex(0). - GetPath("itemSectionRenderer", "contents").GetIndex(0). 
- GetPath("playlistVideoListRenderer", "contents").MarshalJSON() + + if len(p.Author) == 0 { + p.Author = getText(metadata, "owner", "ownerText") + } + + contents, ok := j.CheckGet("contents") + if !ok { + return fmt.Errorf("contents not found in json body") + } + + // contents can have different keys with same child structure + firstPart := getFistKey(contents).GetPath("tabs").GetIndex(0). + GetPath("tabRenderer", "content", "sectionListRenderer", "contents").GetIndex(0) + + // This extra nested item is only set with the web client + if n := firstPart.GetPath("itemSectionRenderer", "contents").GetIndex(0); isValid(n) { + firstPart = n + } + + vJSON, err := firstPart.GetPath("playlistVideoListRenderer", "contents").MarshalJSON() + if err != nil { + return err + } + + if len(vJSON) <= 4 { + return fmt.Errorf("no video data found in JSON") + } entries, continuation, err := extractPlaylistEntries(vJSON) if err != nil { return err } + if len(continuation) == 0 { + continuation = getContinuation(firstPart.Get("playlistVideoListRenderer")) + } + + if len(entries) == 0 { + return fmt.Errorf("no videos found in playlist") + } + p.Videos = entries for continuation != "" { - data := prepareInnertubePlaylistData(continuation, true, webClient) + data := prepareInnertubePlaylistData(continuation, true, *client.client) - body, err := client.httpPostBodyBytes(ctx, "https://www.youtube.com/youtubei/v1/browse?key="+webClient.key, data) + body, err := client.httpPostBodyBytes(ctx, "https://www.youtube.com/youtubei/v1/browse?key="+client.client.key, data) if err != nil { return err } @@ -110,9 +155,15 @@ func (p *Playlist) parsePlaylistInfo(ctx context.Context, client *Client, body [ return err } - vJSON, err := j.GetPath("onResponseReceivedActions").GetIndex(0). - GetPath("appendContinuationItemsAction", "continuationItems").MarshalJSON() + var next *sjson.Json + if next = j.GetPath("onResponseReceivedActions").GetIndex(0). 
+ GetPath("appendContinuationItemsAction", "continuationItems"); isValid(next) { + } else if next = j.GetPath("continuationContents", "playlistVideoListContinuation", "contents"); isValid(next) { + } else { + return fmt.Errorf("failed to extract continuation data") + } + vJSON, err := next.MarshalJSON() if err != nil { return err } @@ -122,7 +173,13 @@ func (p *Playlist) parsePlaylistInfo(ctx context.Context, client *Client, body [ return err } - p.Videos, continuation = append(p.Videos, entries...), token + if len(token) > 0 { + continuation = token + } else { + continuation = getContinuation(j.GetPath("continuationContents", "playlistVideoListContinuation")) + } + + p.Videos = append(p.Videos, entries...) } return err diff --git a/utils.go b/utils.go new file mode 100644 index 00000000..1df604cd --- /dev/null +++ b/utils.go @@ -0,0 +1,105 @@ +package youtube + +import ( + "math" + + sjson "github.com/bitly/go-simplejson" +) + +type chunk struct { + index int + start int64 + end int64 +} + +func getChunks(totalSize, chunkSize int64) []chunk { + var chunks []chunk + + for i := 0; i < int(math.Ceil(float64(totalSize)/float64(chunkSize))); i++ { + start := int64(i) * chunkSize + end := start + chunkSize - 1 + if end >= totalSize { + end = totalSize - 1 + } + + chunks = append(chunks, chunk{i, start, end}) + } + + return chunks +} + +func getFistKey(j *sjson.Json) *sjson.Json { + m, err := j.Map() + if err != nil { + return j + } + + for key := range m { + return j.Get(key) + } + + return j +} + +func isValid(j *sjson.Json) bool { + b, err := j.MarshalJSON() + if err != nil { + return false + } + + if len(b) <= 4 { + return false + } + + return true +} + +func getText(j *sjson.Json, paths ...string) string { + for _, path := range paths { + if isValid(j.Get(path)) { + j = j.Get(path) + } + } + + if text, err := j.String(); err == nil { + return text + } + + if isValid(j.Get("text")) { + return j.Get("text").MustString() + } + + if p := j.Get("runs"); isValid(p) { + 
var text string + + for i := 0; i < len(p.MustArray()); i++ { + if textNode := p.GetIndex(i).Get("text"); isValid(textNode) { + text += textNode.MustString() + } + } + + return text + } + + return "" +} + +func getKeys(j *sjson.Json) []string { + var keys []string + + m, err := j.Map() + if err != nil { + return keys + } + + for key := range m { + keys = append(keys, key) + } + + return keys +} + +func getContinuation(j *sjson.Json) string { + return j.GetPath("continuations"). + GetIndex(0).GetPath("nextContinuationData", "continuation").MustString() +} diff --git a/video.go b/video.go index 4bb7019c..b150fca7 100644 --- a/video.go +++ b/video.go @@ -103,6 +103,10 @@ func (v *Video) extractDataFromPlayerResponse(prData playerResponseData) error { v.Views = views } + if seconds, _ := strconv.Atoi(prData.VideoDetails.LengthSeconds); seconds > 0 { + v.Duration = time.Duration(seconds) * time.Second + } + if seconds, _ := strconv.Atoi(prData.Microformat.PlayerMicroformatRenderer.LengthSeconds); seconds > 0 { v.Duration = time.Duration(seconds) * time.Second } diff --git a/video_id.go b/video_id.go index bc7f1eb6..7de09cb6 100644 --- a/video_id.go +++ b/video_id.go @@ -25,6 +25,7 @@ func ExtractVideoID(videoID string) (string, error) { if strings.ContainsAny(videoID, "?&/<%=") { return "", ErrInvalidCharactersInVideoID } + if len(videoID) < 10 { return "", ErrVideoIDMinLength } diff --git a/video_test.go b/video_test.go index c9e15c0a..668832ed 100644 --- a/video_test.go +++ b/video_test.go @@ -1,15 +1,49 @@ package youtube import ( + "fmt" + "io" "testing" + "time" "github.com/stretchr/testify/require" ) +func TestChunk(t *testing.T) { + fmt.Println(getChunks(100, 10)) +} + +func TestSimpleTest(t *testing.T) { + client := Client{Debug: true, ChunkSize: Size10Mb} + + video, err := client.GetVideo("https://www.youtube.com/watch?v=BaW_jenozKc") + if err != nil { + panic(err) + } + + // Typically youtube only provides separate streams for video and audio. 
+ // If you want audio and video combined, take a look a the downloader package. + format := video.Formats.FindByQuality("hd1080") + + start := time.Now() + reader, _, err := client.GetStream(video, format) + require.NoError(t, err, "get stream") + + fmt.Println("Duration Milliseconds: ", time.Since(start).Milliseconds()) + + // do something with the reader + b, err := io.ReadAll(reader) + if err != nil { + panic(err) + } + + fmt.Println("Downloaded ", len(b)) +} + func ExampleClient_GetStream() { client := Client{Debug: true} - video, err := client.GetVideo("https://www.youtube.com/watch?v=BaW_jenozKc") + video, err := client.GetVideo("https://www.youtube.com/watch?v=9_MbW9FK1fA") if err != nil { panic(err) } From 3509e5f29389f46bdcebe51f97b0abcb16acc933 Mon Sep 17 00:00:00 2001 From: Davincible Date: Wed, 26 Apr 2023 02:02:30 +0200 Subject: [PATCH 2/4] fix: cli --- cmd/youtubedr/downloader.go | 7 ++++--- fetch_testdata_helper.go | 1 + 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/cmd/youtubedr/downloader.go b/cmd/youtubedr/downloader.go index 657608e8..3a478940 100644 --- a/cmd/youtubedr/downloader.go +++ b/cmd/youtubedr/downloader.go @@ -11,10 +11,11 @@ import ( "strconv" "time" - "github.com/kkdai/youtube/v2" - ytdl "github.com/kkdai/youtube/v2/downloader" "github.com/spf13/pflag" "golang.org/x/net/http/httpproxy" + + "github.com/kkdai/youtube/v2" + ytdl "github.com/kkdai/youtube/v2/downloader" ) var ( @@ -94,7 +95,7 @@ func getVideoWithFormat(id string) (*youtube.Video, *youtube.Format, error) { } case outputQuality != "": - format = formats.WithAudioChannels().FindByQuality(outputQuality) + format = formats.FindByQuality(outputQuality) if format == nil { return nil, nil, fmt.Errorf("unable to find format with quality %s", outputQuality) } diff --git a/fetch_testdata_helper.go b/fetch_testdata_helper.go index 9d106729..989970c3 100644 --- a/fetch_testdata_helper.go +++ b/fetch_testdata_helper.go @@ -1,3 +1,4 @@ +//go:build fetch // +build fetch 
package youtube From cd2688178e6ae767f135c7904582288fe6c4593e Mon Sep 17 00:00:00 2001 From: Davincible Date: Wed, 26 Apr 2023 12:05:32 +0200 Subject: [PATCH 3/4] fix: default download size 1Mb > 10Mb --- client.go | 2 +- video_test.go | 49 ++++++++++++++++++++----------------------------- 2 files changed, 21 insertions(+), 30 deletions(-) diff --git a/client.go b/client.go index 1a574f2e..641f95a4 100644 --- a/client.go +++ b/client.go @@ -351,7 +351,7 @@ func (c *Client) getChunkSize() int64 { return c.ChunkSize } - return Size1Mb + return Size10Mb } func (c *Client) getMaxRoutines(limit int) int { diff --git a/video_test.go b/video_test.go index 668832ed..fecf8d7a 100644 --- a/video_test.go +++ b/video_test.go @@ -1,7 +1,6 @@ package youtube import ( - "fmt" "io" "testing" "time" @@ -9,56 +8,48 @@ import ( "github.com/stretchr/testify/require" ) -func TestChunk(t *testing.T) { - fmt.Println(getChunks(100, 10)) -} - -func TestSimpleTest(t *testing.T) { - client := Client{Debug: true, ChunkSize: Size10Mb} +func ExampleClient_GetStream() { + client := Client{Debug: true} - video, err := client.GetVideo("https://www.youtube.com/watch?v=BaW_jenozKc") + video, err := client.GetVideo("https://www.youtube.com/watch?v=9_MbW9FK1fA") if err != nil { panic(err) } // Typically youtube only provides separate streams for video and audio. // If you want audio and video combined, take a look a the downloader package. 
- format := video.Formats.FindByQuality("hd1080") - - start := time.Now() + format := video.Formats.FindByQuality("medium") reader, _, err := client.GetStream(video, format) - require.NoError(t, err, "get stream") - - fmt.Println("Duration Milliseconds: ", time.Since(start).Milliseconds()) - - // do something with the reader - b, err := io.ReadAll(reader) if err != nil { panic(err) } - fmt.Println("Downloaded ", len(b)) + // do something with the reader + + reader.Close() } -func ExampleClient_GetStream() { - client := Client{Debug: true} +func TestSimpleTest(t *testing.T) { + client := Client{Debug: true, ChunkSize: Size10Mb} - video, err := client.GetVideo("https://www.youtube.com/watch?v=9_MbW9FK1fA") - if err != nil { - panic(err) - } + video, err := client.GetVideo("https://www.youtube.com/watch?v=BaW_jenozKc") + require.NoError(t, err, "get body") // Typically youtube only provides separate streams for video and audio. // If you want audio and video combined, take a look a the downloader package. 
- format := video.Formats.FindByQuality("medium") + format := video.Formats.FindByQuality("hd1080") + + start := time.Now() reader, _, err := client.GetStream(video, format) - if err != nil { - panic(err) - } + require.NoError(t, err, "get stream") + + t.Log("Duration Milliseconds: ", time.Since(start).Milliseconds()) // do something with the reader + b, err := io.ReadAll(reader) + require.NoError(t, err, "read body") - reader.Close() + t.Log("Downloaded ", len(b)) } func TestDownload_Regular(t *testing.T) { From 3d2aef90aae3451b5ab195cf60773836254a0cb8 Mon Sep 17 00:00:00 2001 From: Davincible Date: Wed, 26 Apr 2023 23:05:03 +0200 Subject: [PATCH 4/4] feat: integrate transcripts --- client.go | 86 +++++++++++++----- decipher.go | 9 -- player_parse.go | 4 - playlist.go | 21 +++-- transcript.go | 214 +++++++++++++++++++++++++++++++++++++++++++++ transcript_test.go | 32 +++++++ utils.go | 38 ++++---- video_test.go | 5 +- 8 files changed, 339 insertions(+), 70 deletions(-) create mode 100644 transcript.go create mode 100644 transcript_test.go diff --git a/client.go b/client.go index 641f95a4..3cda4dc9 100644 --- a/client.go +++ b/client.go @@ -10,6 +10,7 @@ import ( "log" "math/rand" "net/http" + "net/url" "strconv" "sync" ) @@ -24,6 +25,9 @@ var ( ErrNoFormat = errors.New("no video format provided") ) +// DefaultClient type to use. No reason to change but you could if you wanted to. +var DefaultClient = AndroidClient + // Client offers methods to download video metadata and video streams. 
type Client struct { // Debug enables debugging output through log package @@ -47,6 +51,12 @@ type Client struct { consentID string } +func (c *Client) assureClient() { + if c.client == nil { + c.client = &DefaultClient + } +} + // GetVideo fetches video metadata func (c *Client) GetVideo(url string) (*Video, error) { return c.GetVideoContext(context.Background(), url) @@ -63,7 +73,7 @@ func (c *Client) GetVideoContext(ctx context.Context, url string) (*Video, error } func (c *Client) videoFromID(ctx context.Context, id string) (*Video, error) { - c.client = &androidClient + c.assureClient() body, err := c.videoDataByInnertube(ctx, id) if err != nil { @@ -92,7 +102,7 @@ func (c *Client) videoFromID(ctx context.Context, id string) (*Video, error) { // If the uploader marked the video as inappropriate for some ages, use embed player if errors.Is(err, ErrLoginRequired) { - c.client = &embeddedClient + c.client = &EmbeddedClient bodyEmbed, errEmbed := c.videoDataByInnertube(ctx, id) if errEmbed == nil { @@ -121,9 +131,9 @@ type innertubeRequest struct { BrowseID string `json:"browseId,omitempty"` Continuation string `json:"continuation,omitempty"` Context inntertubeContext `json:"context"` - PlaybackContext playbackContext `json:"playbackContext,omitempty"` - ContentCheckOK bool `json:"contentCheckOk"` - racyCheckOk bool `json:"racyCheckOk"` + PlaybackContext *playbackContext `json:"playbackContext,omitempty"` + ContentCheckOK bool `json:"contentCheckOk,omitempty"` + RacyCheckOk bool `json:"racyCheckOk,omitempty"` Params string `json:"params"` } @@ -133,7 +143,7 @@ type playbackContext struct { type contentPlaybackContext struct { // SignatureTimestamp string `json:"signatureTimestamp"` - html5Preference string `json:"html5Preference"` + HTML5Preference string `json:"html5Preference"` } type inntertubeContext struct { @@ -161,15 +171,16 @@ type clientInfo struct { } var ( - // might add ANDROID and other in future, but i don't see reason yet - webClient = clientInfo{ + 
// WebClient, better to use Android client but go ahead. + WebClient = clientInfo{ name: "WEB", version: "2.20210617.01.00", key: "AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8", userAgent: "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36", } - androidClient = clientInfo{ + // AndroidClient, download go brrrrrr. + AndroidClient = clientInfo{ name: "ANDROID", version: "17.31.35", key: "AIzaSyA8eiZmM1FaDVjRy-df2KTyQ_vz_yYM39w", @@ -177,7 +188,8 @@ var ( androidVersion: 30, } - embeddedClient = clientInfo{ + // EmbeddedClient, not really tested. + EmbeddedClient = clientInfo{ name: "WEB_EMBEDDED_PLAYER", version: "1.19700101", key: "AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8", // seems like same key works for both clients @@ -190,12 +202,12 @@ func (c *Client) videoDataByInnertube(ctx context.Context, id string) ([]byte, e VideoID: id, Context: prepareInnertubeContext(*c.client), ContentCheckOK: true, - racyCheckOk: true, + RacyCheckOk: true, Params: "8AEB", - PlaybackContext: playbackContext{ + PlaybackContext: &playbackContext{ ContentPlaybackContext: contentPlaybackContext{ // SignatureTimestamp: sts, - html5Preference: "HTML5_PREF_WANTS", + HTML5Preference: "HTML5_PREF_WANTS", }, }, } @@ -203,6 +215,15 @@ func (c *Client) videoDataByInnertube(ctx context.Context, id string) ([]byte, e return c.httpPostBodyBytes(ctx, "https://www.youtube.com/youtubei/v1/player?key="+c.client.key, data) } +func (c *Client) transcriptDataByInnertube(ctx context.Context, id string) ([]byte, error) { + data := innertubeRequest{ + Context: prepareInnertubeContext(*c.client), + Params: transcriptVideoID(id), + } + + return c.httpPostBodyBytes(ctx, "https://www.youtube.com/youtubei/v1/get_transcript?key="+c.client.key, data) +} + func prepareInnertubeContext(clientInfo clientInfo) inntertubeContext { return inntertubeContext{ Client: innertubeClient{ @@ -225,7 +246,7 @@ func prepareInnertubePlaylistData(ID string, continuation 
bool, clientInfo clien Context: context, Continuation: ID, ContentCheckOK: true, - racyCheckOk: true, + RacyCheckOk: true, Params: "8AEB", } } @@ -234,11 +255,27 @@ func prepareInnertubePlaylistData(ID string, continuation bool, clientInfo clien Context: context, BrowseID: "VL" + ID, ContentCheckOK: true, - racyCheckOk: true, + RacyCheckOk: true, Params: "8AEB", } } +// transcriptVideoID encodes the video ID to the param used to fetch transcripts. +func transcriptVideoID(videoID string) string { + langCode := encTranscriptLang("en") + + // This can be optionally appended to the Sprintf str, not sure what it means + // *3engagement-panel-searchable-transcript-search-panel\x30\x00\x38\x01\x40\x01 + return base64Enc(fmt.Sprintf("\n\x0b%s\x12\x12%s\x18\x01", videoID, langCode)) +} + +func encTranscriptLang(languageCode string) string { + s := fmt.Sprintf("\n\x03asr\x12\x02%s\x1a\x00", languageCode) + s = base64PadEnc(s) + + return url.QueryEscape(s) +} + // GetPlaylist fetches playlist metadata func (c *Client) GetPlaylist(url string) (*Playlist, error) { return c.GetPlaylistContext(context.Background(), url) @@ -248,7 +285,7 @@ func (c *Client) GetPlaylist(url string) (*Playlist, error) { // for these videos. Playlist entries cannot be downloaded, as they lack all the required metadata, but // can be used to enumerate all IDs, Authors, Titles, etc. 
func (c *Client) GetPlaylistContext(ctx context.Context, url string) (*Playlist, error) { - c.client = &androidClient + c.assureClient() id, err := extractPlaylistID(url) if err != nil { @@ -319,8 +356,7 @@ func (c *Client) GetStreamContext(ctx context.Context, video *Video, format *For func (c *Client) downloadOnce(req *http.Request, w *io.PipeWriter, _ *Format) int64 { resp, err := c.httpDo(req) if err != nil { - //nolint:errcheck - w.CloseWithError(err) + w.CloseWithError(err) //nolint:errcheck return 0 } @@ -330,8 +366,7 @@ func (c *Client) downloadOnce(req *http.Request, w *io.PipeWriter, _ *Format) in if err == nil { w.Close() } else { - //nolint:errcheck - w.CloseWithError(err) + w.CloseWithError(err) //nolint:errcheck } }() @@ -447,8 +482,11 @@ func (c *Client) GetStreamURLContext(ctx context.Context, video *Video, format * } if format.URL != "" { - return format.URL, nil - // return c.unThrottle(ctx, video.ID, format.URL) + if c.client.androidVersion > 0 { + return format.URL, nil + } + + return c.unThrottle(ctx, video.ID, format.URL) } // TODO: check rest of this function, is it redundant? 
@@ -482,7 +520,7 @@ func (c *Client) httpDo(req *http.Request) (*http.Response, error) { req.Header.Set("Sec-Fetch-Mode", "navigate") if len(c.consentID) == 0 { - c.consentID = strconv.Itoa(rand.Intn(899) + 100) + c.consentID = strconv.Itoa(rand.Intn(899) + 100) //nolint:gosec } req.AddCookie(&http.Cookie{ @@ -517,6 +555,7 @@ func (c *Client) httpGet(ctx context.Context, url string) (*http.Response, error resp.Body.Close() return nil, ErrUnexpectedStatusCode(resp.StatusCode) } + return resp, nil } @@ -557,6 +596,7 @@ func (c *Client) httpPost(ctx context.Context, url string, body interface{}) (*h resp.Body.Close() return nil, ErrUnexpectedStatusCode(resp.StatusCode) } + return resp, nil } diff --git a/decipher.go b/decipher.go index c5121752..a8ce33da 100644 --- a/decipher.go +++ b/decipher.go @@ -283,12 +283,3 @@ func (config playerConfig) parseDecipherOps() (operations []DecipherOperation, e } return ops, nil } - -func (config playerConfig) getSignatureTimestamp() (string, error) { - result := signatureRegexp.FindSubmatch(config) - if result == nil { - return "", ErrSignatureTimestampNotFound - } - - return string(result[1]), nil -} diff --git a/player_parse.go b/player_parse.go index cf551aec..7790eeb2 100644 --- a/player_parse.go +++ b/player_parse.go @@ -15,11 +15,7 @@ type playerConfig []byte var basejsPattern = regexp.MustCompile(`(/s/player/\w+/player_ias.vflset/\w+/base.js)`) -// we may use \d{5} instead of \d+ since currently its 5 digits, but i can't be sure it will be 5 digits always -var signatureRegexp = regexp.MustCompile(`(?m)(?:^|,)(?:signatureTimestamp:)(\d+)`) - func (c *Client) getPlayerConfig(ctx context.Context, videoID string) (playerConfig, error) { - embedURL := fmt.Sprintf("https://youtube.com/embed/%s?hl=en", videoID) embedBody, err := c.httpGetBodyBytes(ctx, embedURL) if err != nil { diff --git a/playlist.go b/playlist.go index 12abca83..60462f56 100644 --- a/playlist.go +++ b/playlist.go @@ -94,14 +94,14 @@ func (p *Playlist) 
parsePlaylistInfo(ctx context.Context, client *Client, body [ metadata = metadata.Get("playlistHeaderRenderer") - p.Title = getText(metadata, "title") - p.Description = getText(metadata, "description", "descriptionText") + p.Title = sjsonGetText(metadata, "title") + p.Description = sjsonGetText(metadata, "description", "descriptionText") p.Author = j.GetPath("sidebar", "playlistSidebarRenderer", "items").GetIndex(1). GetPath("playlistSidebarSecondaryInfoRenderer", "videoOwner", "videoOwnerRenderer", "title", "runs"). GetIndex(0).Get("text").MustString() if len(p.Author) == 0 { - p.Author = getText(metadata, "owner", "ownerText") + p.Author = sjsonGetText(metadata, "owner", "ownerText") } contents, ok := j.CheckGet("contents") @@ -110,11 +110,11 @@ func (p *Playlist) parsePlaylistInfo(ctx context.Context, client *Client, body [ } // contents can have different keys with same child structure - firstPart := getFistKey(contents).GetPath("tabs").GetIndex(0). + firstPart := getFirstKeyJSON(contents).GetPath("tabs").GetIndex(0). GetPath("tabRenderer", "content", "sectionListRenderer", "contents").GetIndex(0) // This extra nested item is only set with the web client - if n := firstPart.GetPath("itemSectionRenderer", "contents").GetIndex(0); isValid(n) { + if n := firstPart.GetPath("itemSectionRenderer", "contents").GetIndex(0); isValidJSON(n) { firstPart = n } @@ -155,12 +155,11 @@ func (p *Playlist) parsePlaylistInfo(ctx context.Context, client *Client, body [ return err } - var next *sjson.Json - if next = j.GetPath("onResponseReceivedActions").GetIndex(0). - GetPath("appendContinuationItemsAction", "continuationItems"); isValid(next) { - } else if next = j.GetPath("continuationContents", "playlistVideoListContinuation", "contents"); isValid(next) { - } else { - return fmt.Errorf("failed to extract continuation data") + next := j.GetPath("onResponseReceivedActions").GetIndex(0). 
+ GetPath("appendContinuationItemsAction", "continuationItems") + + if !isValidJSON(next) { + next = j.GetPath("continuationContents", "playlistVideoListContinuation", "contents") } vJSON, err := next.MarshalJSON() diff --git a/transcript.go b/transcript.go new file mode 100644 index 00000000..84571f45 --- /dev/null +++ b/transcript.go @@ -0,0 +1,214 @@ +package youtube + +import ( + "context" + "encoding/json" + "errors" + "fmt" + "strconv" + "strings" +) + +var ( + ErrTranscriptDisabled = errors.New("transcript is disabled on this video") +) + +// TranscriptSegment is a single transcript segment spanning a few milliseconds. +type TranscriptSegment struct { + // Text is the transcript text. + Text string `json:"text"` + + // StartMs is the start timestamp in ms. + StartMs int `json:"offset"` + + // OffsetText e.g. '4:00'. + OffsetText string `json:"offsetText"` + + // Duration the transcript segment spans in ms. + Duration int `json:"duration"` +} + +func (tr TranscriptSegment) String() string { + return tr.OffsetText + " - " + strings.TrimSpace(tr.Text) +} + +type VideoTranscript []TranscriptSegment + +func (vt VideoTranscript) String() string { + var str string + for _, tr := range vt { + str += tr.String() + "\n" + } + + return str +} + +// GetTranscript fetches the video transcript if available. +// +// Not all videos have transcripts, only relatively new videos. +// If transcripts are disabled or not available, ErrTranscriptDisabled is returned. +func (c *Client) GetTranscript(video *Video) (VideoTranscript, error) { + return c.GetTranscriptCtx(context.Background(), video) +} + +// GetTranscriptCtx fetches the video transcript if available. +// +// Not all videos have transcripts, only relatively new videos. +// If transcripts are disabled or not available, ErrTranscriptDisabled is returned. 
+func (c *Client) GetTranscriptCtx(ctx context.Context, video *Video) (VideoTranscript, error) { + c.assureClient() + + if video == nil || video.ID == "" { + return nil, fmt.Errorf("no video provided") + } + + body, err := c.transcriptDataByInnertube(ctx, video.ID) + if err != nil { + return nil, err + } + + transcript, err := parseTranscript(body) + if err != nil { + return nil, err + } + + return transcript, nil +} + +func parseTranscript(body []byte) (VideoTranscript, error) { + var resp transcriptResp + if err := json.Unmarshal(body, &resp); err != nil { + return nil, err + } + + if len(resp.Actions) > 0 { + // Android client response + if app := resp.Actions[0].AppSegment; app != nil { + return getSegments(app) + } + + // Web client response + if web := resp.Actions[0].WebSegment; web != nil { + return nil, fmt.Errorf("not implemented") + } + } + + return nil, ErrTranscriptDisabled +} + +type segmenter interface { + ParseSegments() []TranscriptSegment +} + +func getSegments(f segmenter) (VideoTranscript, error) { + if segments := f.ParseSegments(); len(segments) > 0 { + return segments, nil + } + + return nil, ErrTranscriptDisabled +} + +// transcriptResp is the JSON structure as returned by the transcript API. +type transcriptResp struct { + Actions []struct { + AppSegment *appData `json:"elementsCommand"` + WebSegment *webData `json:"updateEngagementPanelAction"` + } `json:"actions"` +} + +type appData struct { + TEC struct { + Args struct { + ListArgs struct { + Ow struct { + InitialSeg []struct { + TranscriptSegment struct { + StartMs string `json:"startMs"` + EndMs string `json:"endMs"` + Text struct { + String struct { + // Content is the actual transcript text + Content string `json:"content"` + } `json:"elementsAttributedString"` + } `json:"snippet"` + StartTimeText struct { + String struct { + // Content is the formatted timestamp, e.g. 
'4:00' + Content string `json:"content"` + } `json:"elementsAttributedString"` + } `json:"startTimeText"` + } `json:"transcriptSegmentRenderer"` + } `json:"initialSegments"` + } `json:"overwrite"` + } `json:"transformTranscriptSegmentListArguments"` + } `json:"arguments"` + } `json:"transformEntityCommand"` +} + +func (s *appData) ParseSegments() []TranscriptSegment { + rawSegments := s.TEC.Args.ListArgs.Ow.InitialSeg + segments := make([]TranscriptSegment, 0, len(rawSegments)) + + for _, segment := range rawSegments { + startMs, _ := strconv.Atoi(segment.TranscriptSegment.StartMs) + endMs, _ := strconv.Atoi(segment.TranscriptSegment.EndMs) + + segments = append(segments, TranscriptSegment{ + Text: segment.TranscriptSegment.Text.String.Content, + StartMs: startMs, + OffsetText: segment.TranscriptSegment.StartTimeText.String.Content, + Duration: endMs - startMs, + }) + } + + return segments +} + +type webData struct { + Content struct { + TR struct { + Body struct { + TBR struct { + Cues []struct { + Transcript struct { + FormattedStartOffset struct { + SimpleText string `json:"simpleText"` + } `json:"formattedStartOffset"` + Cues []struct { + TranscriptCueRenderer struct { + Cue struct { + SimpleText string `json:"simpleText"` + } `json:"cue"` + StartOffsetMs string `json:"startOffsetMs"` + DurationMs string `json:"durationMs"` + } `json:"transcriptCueRenderer"` + } `json:"cues"` + } `json:"transcriptCueGroupRenderer"` + } `json:"cueGroups"` + } `json:"transcriptSearchPanelRenderer"` + } `json:"content"` + } `json:"transcriptRenderer"` + } `json:"content"` +} + +func (s *webData) ParseSegments() []TranscriptSegment { + // TODO: doesn't actually work now, check json. 
+ cues := s.Content.TR.Body.TBR.Cues + segments := make([]TranscriptSegment, 0, len(cues)) + + for _, s := range cues { + formatted := s.Transcript.FormattedStartOffset.SimpleText + segment := s.Transcript.Cues[0].TranscriptCueRenderer + start, _ := strconv.Atoi(segment.StartOffsetMs) + duration, _ := strconv.Atoi(segment.DurationMs) + + segments = append(segments, TranscriptSegment{ + Text: segment.Cue.SimpleText, + StartMs: start, + OffsetText: formatted, + Duration: duration, + }) + } + + return segments +} diff --git a/transcript_test.go b/transcript_test.go new file mode 100644 index 00000000..748a3610 --- /dev/null +++ b/transcript_test.go @@ -0,0 +1,32 @@ +package youtube + +import ( + "strconv" + "testing" + + "github.com/stretchr/testify/require" +) + +func TestTranscript(t *testing.T) { + client := Client{Debug: true} + + video := &Video{ID: "9_MbW9FK1fA"} + + transcript, err := client.GetTranscript(video) + require.NoError(t, err, "get transcript") + require.Greater(t, len(transcript), 0, "no transcript segments found") + + for i, segment := range transcript { + index := strconv.Itoa(i) + + require.NotEmpty(t, segment.Text, "text "+index) + require.NotEmpty(t, segment.Duration, "duration "+index) + require.NotEmpty(t, segment.OffsetText, "offset "+index) + + if i != 0 { + require.NotEmpty(t, segment.StartMs, "startMs "+index) + } + } + + t.Log(transcript.String()) +} diff --git a/utils.go b/utils.go index 1df604cd..4704a2aa 100644 --- a/utils.go +++ b/utils.go @@ -1,6 +1,7 @@ package youtube import ( + "encoding/base64" "math" sjson "github.com/bitly/go-simplejson" @@ -28,7 +29,7 @@ func getChunks(totalSize, chunkSize int64) []chunk { return chunks } -func getFistKey(j *sjson.Json) *sjson.Json { +func getFirstKeyJSON(j *sjson.Json) *sjson.Json { m, err := j.Map() if err != nil { return j @@ -41,7 +42,7 @@ func getFistKey(j *sjson.Json) *sjson.Json { return j } -func isValid(j *sjson.Json) bool { +func isValidJSON(j *sjson.Json) bool { b, err := 
j.MarshalJSON() if err != nil { return false @@ -54,9 +55,9 @@ func isValid(j *sjson.Json) bool { return true } -func getText(j *sjson.Json, paths ...string) string { +func sjsonGetText(j *sjson.Json, paths ...string) string { for _, path := range paths { - if isValid(j.Get(path)) { + if isValidJSON(j.Get(path)) { j = j.Get(path) } } @@ -65,15 +66,15 @@ func getText(j *sjson.Json, paths ...string) string { return text } - if isValid(j.Get("text")) { + if isValidJSON(j.Get("text")) { return j.Get("text").MustString() } - if p := j.Get("runs"); isValid(p) { + if p := j.Get("runs"); isValidJSON(p) { var text string for i := 0; i < len(p.MustArray()); i++ { - if textNode := p.GetIndex(i).Get("text"); isValid(textNode) { + if textNode := p.GetIndex(i).Get("text"); isValidJSON(textNode) { text += textNode.MustString() } } @@ -84,22 +85,15 @@ func getText(j *sjson.Json, paths ...string) string { return "" } -func getKeys(j *sjson.Json) []string { - var keys []string - - m, err := j.Map() - if err != nil { - return keys - } - - for key := range m { - keys = append(keys, key) - } - - return keys -} - func getContinuation(j *sjson.Json) string { return j.GetPath("continuations"). 
GetIndex(0).GetPath("nextContinuationData", "continuation").MustString() } + +func base64PadEnc(str string) string { + return base64.StdEncoding.EncodeToString([]byte(str)) +} + +func base64Enc(str string) string { + return base64.RawStdEncoding.EncodeToString([]byte(str)) +} diff --git a/video_test.go b/video_test.go index fecf8d7a..9105310e 100644 --- a/video_test.go +++ b/video_test.go @@ -32,9 +32,12 @@ func ExampleClient_GetStream() { func TestSimpleTest(t *testing.T) { client := Client{Debug: true, ChunkSize: Size10Mb} - video, err := client.GetVideo("https://www.youtube.com/watch?v=BaW_jenozKc") + video, err := client.GetVideo("https://www.youtube.com/watch?v=9_MbW9FK1fA") require.NoError(t, err, "get body") + _, err = client.GetTranscript(video) + require.NoError(t, err, "get transcript") + // Typically youtube only provides separate streams for video and audio. // If you want audio and video combined, take a look a the downloader package. format := video.Formats.FindByQuality("hd1080")