Skip to content

Commit

Permalink
Merge pull request #27 from tphoney/parallelise-amazon
Browse files Browse the repository at this point in the history
Parallelise amazon
  • Loading branch information
tphoney authored May 20, 2024
2 parents e57c8d7 + c1f8296 commit 926960a
Show file tree
Hide file tree
Showing 7 changed files with 275 additions and 266 deletions.
11 changes: 8 additions & 3 deletions TODO
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,15 @@

## features

- new release for amazon tv series
- allow amazon tv search for individual series
- new release for cinema-paradiso tv / movie

## bugs

- allow amazon tv search for individual series
- allow amazon tv search for newer series
- music, a-ha/ash doesn't match as an artist, why?
- move language filtering out of plex search,should only happens in web tv & movie
- move language filtering out of plex search, should only happen in web tv & movie web pages
- when scraping movies, do we stop at the first best match ?

## done

Expand All @@ -33,3 +36,5 @@
- parallelise cinema-paradiso movie search 6m20 to 2m25
- parallelise cinema-paradiso tv search
- for movies/tv don't refresh plex list every time, unless necessary
- parallelise amazon search tv/movie
- move newer show out of amazon and cinema-paradiso, move to web page
311 changes: 175 additions & 136 deletions amazon/amazon.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,183 +16,178 @@ import (
)

const (
amazonURL = "https://www.blu-ray.com/movies/search.php?keyword="
amazonURL = "https://www.blu-ray.com/movies/search.php?keyword="
LanguageGerman = "german"
)

func ScrapeTitles(searchResults *types.SearchResults) (scrapedResults []types.MovieSearchResult) {
var results, lookups []types.MovieSearchResult
for _, searchResult := range searchResults.MovieSearchResults {
if !searchResult.BestMatch {
results = append(results, searchResult)
} else {
lookups = append(lookups, searchResult)
}
}
// numberMoviesProcessed is the progress counter for the running movie job;
// it is what GetMovieJobProgress reports.
var numberMoviesProcessed = 0

// numberTVProcessed is the progress counter for the running TV job; it is
// what GetTVJobProgress reports.
var numberTVProcessed = 0

if len(lookups) > 0 {
ch := make(chan *types.MovieSearchResult, len(lookups))
// Limit number of concurrent requests
semaphore := make(chan struct{}, types.ConcurrencyLimit)
for i := range lookups {
go func() {
semaphore <- struct{}{}
defer func() { <-semaphore }()
scrapeTitle(&lookups[i], searchResults.PlexMovie.DateAdded, ch)
}()
}
// SearchAmazonMoviesInParallel starts one goroutine per entry of plexMovies,
// each calling searchAmazonMovie with the given language, and gathers one
// types.SearchResults per movie from a channel buffered to len(plexMovies).
// A semaphore channel of capacity types.ConcurrencyLimit bounds how many
// searches run at the same time. Results are appended in the order they
// arrive on the channel. numberMoviesProcessed is incremented for every
// result received and reset to 0 once all results are in.
func SearchAmazonMoviesInParallel(plexMovies []types.PlexMovie, language string) (searchResults []types.SearchResults) {
numberMoviesProcessed = 0
ch := make(chan types.SearchResults, len(plexMovies))
semaphore := make(chan struct{}, types.ConcurrencyLimit)

for i := range plexMovies {
go func(i int) {
// take a semaphore slot; this blocks while the buffer is full
semaphore <- struct{}{}
// give the slot back when this search finishes
defer func() { <-semaphore }()
searchAmazonMovie(plexMovies[i], language, ch)
}(i)
}

// NOTE(review): the loop over lookups and the "return results" further down
// use names that are not declared in this function; they look like
// removed-side lines of the diff — confirm against the committed file.
for i := 0; i < len(lookups); i++ {
lookup := <-ch
results = append(results, *lookup)
}
searchResults = make([]types.SearchResults, 0, len(plexMovies))
// receive exactly one result per input movie
for range plexMovies {
result := <-ch
searchResults = append(searchResults, result)
numberMoviesProcessed++
}
return results
numberMoviesProcessed = 0 // job is done
fmt.Println("amazon movies found:", len(searchResults))
return searchResults
}

func scrapeTitle(movie *types.MovieSearchResult, dateAdded time.Time, ch chan<- *types.MovieSearchResult) {
req, err := http.NewRequestWithContext(context.Background(), "GET", movie.URL, bytes.NewBuffer([]byte{}))
movie.ReleaseDate = time.Time{}
if err != nil {
fmt.Println("Error creating request:", err)
ch <- movie
return
// SearchAmazonTVInParallel starts one goroutine per entry of plexTVShows,
// each calling searchAmazonTV with a pointer to that entry and the given
// language, and gathers one types.SearchResults per show from a channel
// buffered to len(plexTVShows). A semaphore channel of capacity
// types.ConcurrencyLimit bounds how many searches run at the same time.
// Results are appended in the order they arrive on the channel.
// numberTVProcessed is incremented for every result received and reset to 0
// once all results are in.
func SearchAmazonTVInParallel(plexTVShows []types.PlexTVShow, language string) (searchResults []types.SearchResults) {
// NOTE(review): this resets numberMoviesProcessed, but the collection loop
// below counts in numberTVProcessed — this probably should reset
// numberTVProcessed instead; as written it clears the movie job's counter.
numberMoviesProcessed = 0
ch := make(chan types.SearchResults, len(plexTVShows))
semaphore := make(chan struct{}, types.ConcurrencyLimit)

for i := range plexTVShows {
go func(i int) {
// take a semaphore slot; this blocks while the buffer is full
semaphore <- struct{}{}
// give the slot back when this search finishes
defer func() { <-semaphore }()
searchAmazonTV(&plexTVShows[i], language, ch)
}(i)
}

// NOTE(review): the request/response lines that follow (req, client, resp,
// movie) use names not declared in this function; they look like
// removed-side lines of the diff — confirm against the committed file.
req.Header.Set("User-Agent",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3")

client := &http.Client{}
resp, err := client.Do(req)
if err != nil {
fmt.Println("Error sending request:", err)
ch <- movie
return
searchResults = make([]types.SearchResults, 0, len(plexTVShows))
// receive exactly one result per input show
for range plexTVShows {
result := <-ch
searchResults = append(searchResults, result)
numberTVProcessed++
}
numberTVProcessed = 0 // job is done
fmt.Println("amazon TV shows found:", len(searchResults))
return searchResults
}

defer resp.Body.Close()
// GetMovieJobProgress reports the current value of the package-level
// numberMoviesProcessed counter.
func GetMovieJobProgress() int {
	processed := numberMoviesProcessed
	return processed
}

body, err := io.ReadAll(resp.Body)
if err != nil {
fmt.Println("Error reading response body:", err)
ch <- movie
return
// GetTVJobProgress reports the current value of the package-level
// numberTVProcessed counter.
func GetTVJobProgress() int {
	processed := numberTVProcessed
	return processed
}

// ScrapeTitlesParallel starts one goroutine per entry of searchResults, each
// calling scrapeTitles with a pointer to that entry, and gathers one
// types.SearchResults per entry from a channel buffered to
// len(searchResults). A semaphore channel of capacity types.ConcurrencyLimit
// bounds how many scrapes run at the same time. Results are appended in the
// order they arrive on the channel. numberMoviesProcessed is incremented for
// every result received and reset to 0 once all results are in.
func ScrapeTitlesParallel(searchResults []types.SearchResults) (scrapedResults []types.SearchResults) {
numberMoviesProcessed = 0
ch := make(chan types.SearchResults, len(searchResults))
semaphore := make(chan struct{}, types.ConcurrencyLimit)
for i := range searchResults {
go func(i int) {
// take a semaphore slot; this blocks while the buffer is full
semaphore <- struct{}{}
// give the slot back when this scrape finishes
defer func() { <-semaphore }()
scrapeTitles(&searchResults[i], ch)
}(i)
}
// NOTE(review): the lines using body, movie and dateAdded here, and the
// "ch <- movie" below, use names not declared in this function; they look
// like removed-side lines of the diff — confirm against the committed file.
rawData := string(body)
movie.ReleaseDate = findTitleDetails(rawData)
if movie.ReleaseDate.After(dateAdded) {
movie.NewRelease = true

scrapedResults = make([]types.SearchResults, 0, len(searchResults))
// receive exactly one result per input entry
for range searchResults {
result := <-ch
scrapedResults = append(scrapedResults, result)
numberMoviesProcessed++
}
ch <- movie
numberMoviesProcessed = 0
fmt.Println("amazon Movie titles scraped:", len(scrapedResults))
return scrapedResults
}

func findTitleDetails(response string) (releaseDate time.Time) {
r := regexp.MustCompile(`<a class="grey noline" alt=".*">(.*?)</a></span>`)

match := r.FindStringSubmatch(response)
if match != nil {
stringDate := match[1]
var err error
releaseDate, err = time.Parse("Jan 02, 2006", stringDate)
// scrapeTitles fetches the page behind every entry of
// searchResult.MovieSearchResults that is flagged BestMatch and fills in that
// entry's ReleaseDate from the first regexp match in the page. NewRelease is
// set to true when the parsed date is after searchResult.PlexMovie.DateAdded.
// The (possibly updated) *searchResult is sent on ch exactly once: either
// after the loop completes, or immediately when a request fails.
func scrapeTitles(searchResult *types.SearchResults, ch chan<- types.SearchResults) {
dateAdded := searchResult.PlexMovie.DateAdded
for i := range searchResult.MovieSearchResults {
// this is to limit the number of requests
if !searchResult.MovieSearchResults[i].BestMatch {
continue
}
rawData, err := makeRequest(searchResult.MovieSearchResults[i].URL, "")
if err != nil {
// NOTE(review): the releaseDate assignments and "return releaseDate" in
// this span use a name not declared in this function; they look like
// removed-side lines of the diff — confirm against the committed file.
releaseDate = time.Time{}
fmt.Println("scrapeTitle: Error making request:", err)
// a failed request ends the whole scrape for this searchResult; any
// remaining best-match entries are not fetched
ch <- *searchResult
return
}
// Find the release date
searchResult.MovieSearchResults[i].ReleaseDate = time.Time{} // default to zero time
// NOTE(review): the pattern is compiled again on every loop iteration; it
// could be compiled once outside the loop.
r := regexp.MustCompile(`<a class="grey noline" alt=".*">(.*?)</a></span>`)
match := r.FindStringSubmatch(rawData)
if match != nil {
stringDate := match[1]
// the parse error is discarded; on failure time.Parse yields the zero time
searchResult.MovieSearchResults[i].ReleaseDate, _ = time.Parse("Jan 02, 2006", stringDate)
}
if searchResult.MovieSearchResults[i].ReleaseDate.After(dateAdded) {
searchResult.MovieSearchResults[i].NewRelease = true
}
} else {
releaseDate = time.Time{}
}

return releaseDate
ch <- *searchResult
}

func SearchAmazonMovie(plexMovie types.PlexMovie, filter string) (movieSearchResult types.SearchResults, err error) {
// searchAmazonMovie searches for plexMovie.Title and sends exactly one
// types.SearchResults on movieSearchResult.
//
// The search URL is the package-level amazonURL plus the query-escaped title,
// plus "&audio=<language>" when language is LanguageGerman, plus
// "&submit=Search&action=search". The page is fetched with makeRequest. On a
// request error the result sent carries only PlexMovie (SearchURL is ""); on
// success MovieSearchResults is filled from findTitlesInResponse and the
// result is passed through utils.MarkBestMatch before being sent.
//
// NOTE(review): this span also contains lines that reference filter, req,
// resp, body and assign fields on / return movieSearchResult as a struct;
// those do not fit the channel-based signature and look like removed-side
// lines of the diff — confirm against the committed file.
func searchAmazonMovie(plexMovie types.PlexMovie, language string, movieSearchResult chan<- types.SearchResults) {
result := types.SearchResults{}
result.PlexMovie = plexMovie
result.SearchURL = ""

urlEncodedTitle := url.QueryEscape(plexMovie.Title)
amazonURL := amazonURL + urlEncodedTitle
if filter != "" {
amazonURL += filter
// this searches for the movie in a language
switch language {
case LanguageGerman:
amazonURL += "&audio=" + language
default:
// do nothing
}
amazonURL += "&submit=Search&action=search"
req, err := http.NewRequestWithContext(context.Background(), "GET", amazonURL, bytes.NewBuffer([]byte{}))

movieSearchResult.PlexMovie = plexMovie
movieSearchResult.SearchURL = amazonURL

req.Header.Set("User-Agent",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3")
country := "uk"
if strings.Contains(filter, "german") {
country = "de"
}
req.Header.Set("Cookie", fmt.Sprintf("country=%s;", country))
if err != nil {
fmt.Println("Error creating request:", err)
return movieSearchResult, err
}

client := &http.Client{}
resp, err := client.Do(req)
if err != nil {
fmt.Println("Error sending request:", err)
return movieSearchResult, err
}

defer resp.Body.Close()

body, err := io.ReadAll(resp.Body)
rawData, err := makeRequest(amazonURL, language)
if err != nil {
fmt.Println("Error reading response body:", err)
return movieSearchResult, err
fmt.Println("searchAmazonMovie: Error making request:", err)
// still send a result so the collector receives one value per movie
movieSearchResult <- result
return
}
rawData := string(body)

moviesFound, _ := findTitlesInResponse(rawData, true)
movieSearchResult.MovieSearchResults = moviesFound
movieSearchResult = utils.MarkBestMatch(&movieSearchResult)
return movieSearchResult, nil
result.MovieSearchResults = moviesFound
result = utils.MarkBestMatch(&result)
movieSearchResult <- result
}

func SearchAmazonTV(plexTVShow *types.PlexTVShow, filter string) (tvSearchResult types.SearchResults, err error) {
// searchAmazonTV searches for "<plexTVShow.Title> complete series" and sends
// exactly one types.SearchResults on tvSearchResult.
//
// The search URL is the package-level amazonURL plus the query-escaped search
// phrase, plus "&audio=<language>" when language is LanguageGerman, plus
// "&submit=Search&action=search". The page is fetched with makeRequest. On a
// request error the result sent carries only PlexTVShow and SearchURL; on
// success TVSearchResults is filled from findTitlesInResponse and the result
// is passed through utils.MarkBestMatch before being sent.
//
// NOTE(review): this span also contains lines that reference filter, req,
// resp, body and assign fields on / return tvSearchResult as a struct; those
// do not fit the channel-based signature and look like removed-side lines of
// the diff — confirm against the committed file.
func searchAmazonTV(plexTVShow *types.PlexTVShow, language string, tvSearchResult chan<- types.SearchResults) {
result := types.SearchResults{}
result.PlexTVShow = *plexTVShow
// NOTE(review): at this point the local amazonURL below is not yet declared,
// so this stores the package-level base URL, not the full search URL;
// searchAmazonMovie stores "" here instead — confirm which is intended.
result.SearchURL = amazonURL

urlEncodedTitle := url.QueryEscape(fmt.Sprintf("%s complete series", plexTVShow.Title)) // complete series
amazonURL := amazonURL + urlEncodedTitle
if filter != "" {
amazonURL += filter
// this searches for the movie in a language
switch language {
case LanguageGerman:
amazonURL += "&audio=" + language
default:
// do nothing
}
amazonURL += "&submit=Search&action=search"
req, err := http.NewRequestWithContext(context.Background(), "GET", amazonURL, bytes.NewBuffer([]byte{}))

tvSearchResult.PlexTVShow = *plexTVShow
tvSearchResult.SearchURL = amazonURL

req.Header.Set("User-Agent",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3")
country := "uk"
if strings.Contains(filter, "german") {
country = "de"
}
req.Header.Set("Cookie", fmt.Sprintf("country=%s;", country))
if err != nil {
fmt.Println("Error creating request:", err)
return tvSearchResult, err
}

client := &http.Client{}
resp, err := client.Do(req)
rawData, err := makeRequest(amazonURL, language)
if err != nil {
fmt.Println("Error sending request:", err)
return tvSearchResult, err
}

defer resp.Body.Close()

body, err := io.ReadAll(resp.Body)
if err != nil {
fmt.Println("Error reading response body:", err)
return tvSearchResult, err
fmt.Println("searchAmazonTV: Error making request:", err)
// still send a result so the collector receives one value per show
tvSearchResult <- result
return
}
rawData := string(body)

_, titlesFound := findTitlesInResponse(rawData, false)
tvSearchResult.TVSearchResults = titlesFound
tvSearchResult = utils.MarkBestMatch(&tvSearchResult)
return tvSearchResult, nil
result.TVSearchResults = titlesFound
result = utils.MarkBestMatch(&result)
tvSearchResult <- result
}

func findTitlesInResponse(response string, movie bool) (movieResults []types.MovieSearchResult, tvResults []types.TVSearchResult) {
Expand Down Expand Up @@ -256,3 +251,47 @@ func findTitlesInResponse(response string, movie bool) (movieResults []types.Mov

return movieResults, tvResults
}

// makeRequest performs an HTTP GET against inputURL and returns the response
// body as a string.
//
// A browser-like User-Agent header is always sent. The language argument
// selects the region cookie: LanguageGerman sends "country=de;", any other
// value sends "country=uk;".
//
// An error is returned when the request cannot be created or sent, when the
// body cannot be read, or when the status code is not 200 OK. In every error
// case the returned string is empty.
func makeRequest(inputURL, language string) (response string, err error) {
	req, err := http.NewRequestWithContext(context.Background(), "GET", inputURL, bytes.NewBuffer([]byte{}))
	// The error must be checked before req is used: on failure (for example
	// an unparsable URL) req is nil and setting a header on it would panic.
	if err != nil {
		fmt.Println("makeRequest: error creating request:", err)
		return response, err
	}

	req.Header.Set("User-Agent",
		"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3")

	// this forces results from a specific amazon region
	switch language {
	case LanguageGerman:
		req.Header.Set("Cookie", "country=de;")
	default:
		req.Header.Set("Cookie", "country=uk;")
	}

	client := &http.Client{}
	resp, err := client.Do(req)
	if err != nil {
		fmt.Println("makeRequest: error sending request:", err)
		return response, err
	}

	defer resp.Body.Close()

	// The body is read even for non-200 responses so it is fully consumed
	// before the deferred Close runs.
	body, err := io.ReadAll(resp.Body)
	if err != nil {
		fmt.Println("makeRequest: error reading response body:", err)
		return response, err
	}

	// check for a 200 status code
	if resp.StatusCode != http.StatusOK {
		fmt.Println("amazon: status code not OK, probably rate limited:", resp.StatusCode)
		return response, fmt.Errorf("amazon: status code not OK: %d", resp.StatusCode)
	}

	return string(body), nil
}
Loading

0 comments on commit 926960a

Please sign in to comment.