From 5a0baaf91e26d9322e18185075509564d6add35a Mon Sep 17 00:00:00 2001
From: Alex Ferrari
Date: Tue, 23 Jan 2024 13:37:54 +0100
Subject: [PATCH] Implement download resume

---
 README.md       |  6 +++++-
 cmd/download.go | 49 +++++++++++++++++++++++++++++++++++++++++++++++--
 cmd/version.go  |  2 +-
 3 files changed, 53 insertions(+), 4 deletions(-)

diff --git a/README.md b/README.md
index 9916f36..ae63302 100644
--- a/README.md
+++ b/README.md
@@ -44,6 +44,10 @@ Use "sbstck-dl [command] --help" for more information about a command.
 
 You can provide the url of a single post or the main url of the Substack you want to download.
 
+By providing the main URL of a Substack, the downloader will download all the posts of the archive.
+
+When downloading the full archive, if the downloader is interrupted, at the next execution it will resume the download of the remaining posts.
+
 ```bash
 Usage:
   sbstck-dl download [flags]
@@ -104,7 +108,6 @@ sbstck-dl download --url https://example.substack.com --cookie_name substack.sid
 
 ## TODO
 
-- [ ] Implementing resuming downloads
 - [ ] Improve retry logic
 - [ ] Implement loading from config file
 - [ ] Add support for downloading media
@@ -113,3 +116,4 @@ sbstck-dl download --url https://example.substack.com --cookie_name substack.sid
 - [x] Add documentation
 - [x] Add support for private newsletters
 - [x] Implement filtering by date
+- [x] Implement resuming downloads
diff --git a/cmd/download.go b/cmd/download.go
index beab8ec..fa210df 100644
--- a/cmd/download.go
+++ b/cmd/download.go
@@ -4,6 +4,7 @@ import (
 	"fmt"
 	"log"
 	"net/url"
+	"path/filepath"
 	"strings"
 	"time"
 
@@ -62,17 +63,36 @@ var (
 		var downloadedPostsCount int
 		dateFilterfunc := makeDateFilterFunc(beforeDate, afterDate)
 		urls, err := extractor.GetAllPostsURLs(ctx, downloadUrl, dateFilterfunc)
+		urlsCount := len(urls)
 		if err != nil {
 			log.Fatalln(err)
 		}
+		if urlsCount == 0 {
+			if verbose {
+				fmt.Println("No posts found, exiting...")
+			}
+			return
+		}
 		if verbose {
-			fmt.Printf("Found %d posts\n", len(urls))
+			fmt.Printf("Found %d posts\n", urlsCount)
 		}
 		if dryRun {
-			fmt.Printf("Found %d posts\n", len(urls))
+			fmt.Printf("Found %d posts\n", urlsCount)
 			fmt.Println("Dry run, exiting...")
 			return
 		}
+		urls, err = filterExistingPosts(urls, outputFolder, format)
+		if err != nil {
+			if verbose {
+				fmt.Println("Error filtering existing posts:", err)
+			}
+		}
+		if len(urls) == 0 {
+			if verbose {
+				fmt.Println("No new posts found, exiting...")
+			}
+			return
+		}
 		bar := progressbar.NewOptions(len(urls),
 			progressbar.OptionSetWidth(25),
 			progressbar.OptionSetDescription("downloading"),
@@ -154,3 +174,28 @@ func parseURL(toTest string) (*url.URL, error) {
 func makePath(post lib.Post, outputFolder string, format string) string {
 	return fmt.Sprintf("%s/%s_%s.%s", outputFolder, convertDateTime(post.PostDate), post.Slug, format)
 }
+
+// extractSlug extracts the slug from a Substack post URL
+// e.g. https://example.substack.com/p/this-is-the-post-title -> this-is-the-post-title
+func extractSlug(url string) string {
+	split := strings.Split(url, "/")
+	return split[len(split)-1]
+}
+
+// filterExistingPosts filters out posts that already exist in the output folder.
+// It looks for files whose name ends with the post slug.
+func filterExistingPosts(urls []string, outputFolder string, format string) ([]string, error) {
+	var filtered []string
+	for _, url := range urls {
+		slug := extractSlug(url)
+		path := fmt.Sprintf("%s/%s_%s.%s", outputFolder, "*", slug, format)
+		matches, err := filepath.Glob(path)
+		if err != nil {
+			return urls, err
+		}
+		if len(matches) == 0 {
+			filtered = append(filtered, url)
+		}
+	}
+	return filtered, nil
+}
diff --git a/cmd/version.go b/cmd/version.go
index 54d0c8e..3e91d23 100644
--- a/cmd/version.go
+++ b/cmd/version.go
@@ -12,7 +12,7 @@ var versionCmd = &cobra.Command{
 	Short: "Print the version number of sbstck-dl",
 	Long:  `Display the current version of the app.`,
 	Run: func(cmd *cobra.Command, args []string) {
-		fmt.Println("sbstck-dl v0.3.1")
+		fmt.Println("sbstck-dl v0.3.2")
 	},
 }
 