From 5a0baaf91e26d9322e18185075509564d6add35a Mon Sep 17 00:00:00 2001
From: Alex Ferrari
Date: Tue, 23 Jan 2024 13:37:54 +0100
Subject: [PATCH] Implement download resume

---
 README.md       |  6 +++++-
 cmd/download.go | 49 +++++++++++++++++++++++++++++++++++++++++++++++--
 cmd/version.go  |  2 +-
 3 files changed, 53 insertions(+), 4 deletions(-)

diff --git a/README.md b/README.md
index 9916f36..ae63302 100644
--- a/README.md
+++ b/README.md
@@ -44,6 +44,10 @@ Use "sbstck-dl [command] --help" for more information about a command.
 
 You can provide the url of a single post or the main url of the Substack you want to download.
 
+By providing the main URL of a Substack, the downloader will download all the posts of the archive.
+
+When downloading the full archive, if the downloader is interrupted, at the next execution it will resume the download of the remaining posts.
+
 ```bash
 Usage:
   sbstck-dl download [flags]
@@ -104,7 +108,6 @@ sbstck-dl download --url https://example.substack.com --cookie_name substack.sid
 
 ## TODO
 
-- [ ] Implementing resuming downloads
 - [ ] Improve retry logic
 - [ ] Implement loading from config file
 - [ ] Add support for downloading media
@@ -113,3 +116,4 @@ sbstck-dl download --url https://example.substack.com --cookie_name substack.sid
 - [x] Add documentation
 - [x] Add support for private newsletters
 - [x] Implement filtering by date
+- [x] Implement resuming downloads
diff --git a/cmd/download.go b/cmd/download.go
index beab8ec..fa210df 100644
--- a/cmd/download.go
+++ b/cmd/download.go
@@ -4,6 +4,7 @@ import (
 	"fmt"
 	"log"
 	"net/url"
+	"path/filepath"
 	"strings"
 	"time"
 
@@ -62,17 +63,36 @@ var (
 		var downloadedPostsCount int
 		dateFilterfunc := makeDateFilterFunc(beforeDate, afterDate)
 		urls, err := extractor.GetAllPostsURLs(ctx, downloadUrl, dateFilterfunc)
+		urlsCount := len(urls)
 		if err != nil {
 			log.Fatalln(err)
 		}
+		if urlsCount == 0 {
+			if verbose {
+				fmt.Println("No posts found, exiting...")
+			}
+			return
+		}
 		if verbose {
-			fmt.Printf("Found %d posts\n", len(urls))
+			fmt.Printf("Found %d posts\n", urlsCount)
 		}
 		if dryRun {
-			fmt.Printf("Found %d posts\n", len(urls))
+			fmt.Printf("Found %d posts\n", urlsCount)
 			fmt.Println("Dry run, exiting...")
 			return
 		}
+		urls, err = filterExistingPosts(urls, outputFolder, format)
+		if err != nil {
+			if verbose {
+				fmt.Println("Error filtering existing posts:", err)
+			}
+		}
+		if len(urls) == 0 {
+			if verbose {
+				fmt.Println("No new posts found, exiting...")
+			}
+			return
+		}
 		bar := progressbar.NewOptions(len(urls),
 			progressbar.OptionSetWidth(25),
 			progressbar.OptionSetDescription("downloading"),
@@ -154,3 +174,28 @@ func parseURL(toTest string) (*url.URL, error) {
 func makePath(post lib.Post, outputFolder string, format string) string {
 	return fmt.Sprintf("%s/%s_%s.%s", outputFolder, convertDateTime(post.PostDate), post.Slug, format)
 }
+
+// extractSlug extracts the slug from a Substack post URL
+// e.g. https://example.substack.com/p/this-is-the-post-title -> this-is-the-post-title
+func extractSlug(url string) string {
+	split := strings.Split(url, "/")
+	return split[len(split)-1]
+}
+
+// filterExistingPosts filters out posts that already exist in the output folder.
+// It looks for files whose name ends with the post slug.
+func filterExistingPosts(urls []string, outputFolder string, format string) ([]string, error) {
+	var filtered []string
+	for _, url := range urls {
+		slug := extractSlug(url)
+		path := fmt.Sprintf("%s/%s_%s.%s", outputFolder, "*", slug, format)
+		matches, err := filepath.Glob(path)
+		if err != nil {
+			return urls, err
+		}
+		if len(matches) == 0 {
+			filtered = append(filtered, url)
+		}
+	}
+	return filtered, nil
+}
diff --git a/cmd/version.go b/cmd/version.go
index 54d0c8e..3e91d23 100644
--- a/cmd/version.go
+++ b/cmd/version.go
@@ -12,7 +12,7 @@ var versionCmd = &cobra.Command{
 	Short: "Print the version number of sbstck-dl",
 	Long:  `Display the current version of the app.`,
 	Run: func(cmd *cobra.Command, args []string) {
-		fmt.Println("sbstck-dl v0.3.1")
+		fmt.Println("sbstck-dl v0.3.2")
 	},
 }
 