Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: allow import of newer pocket data export files in csv format #1023

Merged
merged 5 commits into from
Dec 11, 2024
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
187 changes: 144 additions & 43 deletions internal/cmd/pocket.go
Original file line number Diff line number Diff line change
@@ -1,22 +1,29 @@
package cmd

import (
"context"
"encoding/csv"
"errors"
"fmt"
"os"
"path/filepath"
"regexp"
"slices"
"strconv"
"strings"
"time"

"github.com/PuerkitoBio/goquery"
"github.com/go-shiori/shiori/internal/core"
"github.com/go-shiori/shiori/internal/database"
"github.com/go-shiori/shiori/internal/model"
"github.com/spf13/cobra"
)

func pocketCmd() *cobra.Command {
cmd := &cobra.Command{
Use: "pocket source-file",
Short: "Import bookmarks from Pocket's exported HTML file",
Short: "Import bookmarks from Pocket's data export file",
Args: cobra.ExactArgs(1),
Run: pocketHandler,
}
Expand All @@ -25,17 +32,43 @@ func pocketCmd() *cobra.Command {
}

func pocketHandler(cmd *cobra.Command, args []string) {
_, deps := initShiori(cmd.Context(), cmd)
ctx := cmd.Context()
_, deps := initShiori(ctx, cmd)

// Open pocket's file
srcFile, err := os.Open(args[0])
filePath := args[0]
srcFile, err := os.Open(filePath)
if err != nil {
cError.Println(err)
os.Exit(1)
}
defer srcFile.Close()

// Parse pocket's file
var bookmarks []model.BookmarkDTO
switch filepath.Ext(filePath) {
case ".html":
bookmarks = parseHtmlExport(ctx, deps.Database, srcFile)
case ".csv":
bookmarks = parseCsvExport(ctx, deps.Database, srcFile)
default:
cError.Println("Invalid file format. Only HTML and CSV are supported.")
os.Exit(1)
}

// Save bookmark to database
bookmarks, err = deps.Database.SaveBookmarks(ctx, true, bookmarks...)
if err != nil {
cError.Printf("Failed to save bookmarks: %v\n", err)
os.Exit(1)
}

// Print imported bookmarks
fmt.Println()
printBookmarks(bookmarks...)
}

// Parse bookmarks from HTML file
func parseHtmlExport(ctx context.Context, db database.DB, srcFile *os.File) []model.BookmarkDTO {
bookmarks := []model.BookmarkDTO{}
mapURL := make(map[string]struct{})

Expand All @@ -49,69 +82,137 @@ func pocketHandler(cmd *cobra.Command, args []string) {
// Get metadata
title := a.Text()
url, _ := a.Attr("href")
strTags, _ := a.Attr("tags")
strModified, _ := a.Attr("time_added")
intModified, _ := strconv.ParseInt(strModified, 10, 64)
modified := time.Unix(intModified, 0)

// Clean up URL
var err error
url, err = core.RemoveUTMParams(url)
tagsStr, _ := a.Attr("tags")
timeAddedStr, _ := a.Attr("time_added")

title, url, timeAdded, tags, err := verifyMetadata(title, url, timeAddedStr, tagsStr)
if err != nil {
cError.Printf("Skip %s: URL is not valid\n", url)
cError.Printf("Skip %s: %v\n", url, err)
return
}

// Make sure title is valid Utf-8
title = validateTitle(title, url)

// Check if the URL already exist before, both in bookmark
// file or in database
if _, exist := mapURL[url]; exist {
cError.Printf("Skip %s: URL already exists\n", url)
if err = handleDuplicates(ctx, db, mapURL, url); err != nil {
cError.Printf("Skip %s: %v\n", url, err)
return
}

_, exist, err := deps.Database.GetBookmark(cmd.Context(), 0, url)
if err != nil {
cError.Printf("Skip %s: Get Bookmark fail, %v", url, err)
return
// Add item to list
bookmark := model.BookmarkDTO{
URL: url,
Title: title,
ModifiedAt: timeAdded.Format(model.DatabaseDateFormat),
CreatedAt: timeAdded.Format(model.DatabaseDateFormat),
Tags: tags,
}

if exist {
cError.Printf("Skip %s: URL already exists\n", url)
mapURL[url] = struct{}{}
return
}
mapURL[url] = struct{}{}
bookmarks = append(bookmarks, bookmark)
})

return bookmarks
}

// Parse bookmarks from CSV file
func parseCsvExport(ctx context.Context, db database.DB, srcFile *os.File) []model.BookmarkDTO {
bookmarks := []model.BookmarkDTO{}
mapURL := make(map[string]struct{})

// Get bookmark tags
tags := []model.Tag{}
for _, strTag := range strings.Split(strTags, ",") {
if strTag != "" {
tags = append(tags, model.Tag{Name: strTag})
reader := csv.NewReader(srcFile)
records, err := reader.ReadAll()
if err != nil {
cError.Println(err)
os.Exit(1)
}

for i, cols := range records {
// Check and skip header
if i == 0 {
expected := []string{"title", "url", "time_added", "cursor", "tags", "status"}
if slices.Compare(cols, expected) != 0 {
cError.Printf("Invalid CSV format. Header must be: %s\n", strings.Join(expected, ","))
os.Exit(1)
}
continue
}

// Get metadata
title, url, timeAdded, tags, err := verifyMetadata(cols[0], cols[1], cols[2], cols[4])
if err != nil {
cError.Printf("Skip %s: %v\n", url, err)
continue
}

if err = handleDuplicates(ctx, db, mapURL, url); err != nil {
cError.Printf("Skip %s: %v\n", url, err)
continue
}

// Add item to list
bookmark := model.BookmarkDTO{
URL: url,
Title: title,
ModifiedAt: modified.Format(model.DatabaseDateFormat),
ModifiedAt: timeAdded.Format(model.DatabaseDateFormat),
CreatedAt: timeAdded.Format(model.DatabaseDateFormat),
Tags: tags,
}

mapURL[url] = struct{}{}
bookmarks = append(bookmarks, bookmark)
})
}

// Save bookmark to database
bookmarks, err = deps.Database.SaveBookmarks(cmd.Context(), true, bookmarks...)
return bookmarks
}

// Parse metadata and verify its validity
func verifyMetadata(title, url, timeAddedStr, tags string) (string, string, time.Time, []model.Tag, error) {
// Clean up URL
var err error
url, err = core.RemoveUTMParams(url)
if err != nil {
cError.Printf("Failed to save bookmarks: %v\n", err)
os.Exit(1)
err = fmt.Errorf("URL is not valid, %w", err)
return "", "", time.Time{}, nil, err
}

// Print imported bookmark
fmt.Println()
printBookmarks(bookmarks...)
// Make sure title is valid Utf-8
title = validateTitle(title, url)

// Parse time added
timeAddedInt, err := strconv.ParseInt(timeAddedStr, 10, 64)
if err != nil {
err = fmt.Errorf("Invalid time added, %w", err)
return "", "", time.Time{}, nil, err
}
timeAdded := time.Unix(timeAddedInt, 0)

// Get bookmark tags
tagsList := []model.Tag{}
// We need to split tags on both comma and pipe,
// because Pocket's CSV export uses a pipe as the separator,
// while the HTML export uses a comma.
for _, tag := range regexp.MustCompile(`[,|]`).Split(tags, -1) {
if tag != "" {
tagsList = append(tagsList, model.Tag{Name: tag})
}
}

return title, url, timeAdded, tagsList, nil
}

// handleDuplicates reports whether url has already been recorded, either in
// mapURL (the URLs collected so far from the file being imported) or in the
// database as reported by db.GetBookmark.
//
// It returns nil when url is found in neither place, and a non-nil error when
// url is present in mapURL, when the database reports it as existing, or when
// the database lookup itself fails (the lookup error is wrapped).
func handleDuplicates(ctx context.Context, db database.DB, mapURL map[string]struct{}, url string) error {
	// Consult the in-memory set first so that the database is only queried
	// for URLs not yet seen during this import.
	if _, seen := mapURL[url]; seen {
		return errors.New("URL already exists")
	}

	_, found, lookupErr := db.GetBookmark(ctx, 0, url)
	switch {
	case lookupErr != nil:
		return fmt.Errorf("Failed getting bookmark, %w", lookupErr)
	case found:
		return errors.New("URL already exists")
	default:
		return nil
	}
}
Loading