diff --git a/src/lib/rt.ts b/src/lib/rt.ts index 13d82d5..92a4af3 100644 --- a/src/lib/rt.ts +++ b/src/lib/rt.ts @@ -63,9 +63,10 @@ export async function scrapeRottenTomatoesInfoByUrlImpl( movie.genres = schema.genre } - if (schema.url) { - movie.rtUrl = schema.url - } + // TODO + // if (schema.url) { + // movie.rtUrl = schema.url + // } if (schema.description) { movie.plot = schema.description diff --git a/src/lib/wikidata.ts b/src/lib/wikidata.ts index 89c71f1..4e56b0a 100644 --- a/src/lib/wikidata.ts +++ b/src/lib/wikidata.ts @@ -115,9 +115,12 @@ export async function fetchAllWikidataMovies({ } offset += limit + + // this approach has a hard offset limit of 10k imposed by wikidata // // find all films which have both an IMDB id and a rotten tomatoes id // const url = wdk.cirrusSearchPages({ // search: query, + // // TODO: this doesn't support children of Q11424 films (like unfinished films) // haswbstatement: ['P31=Q11424', 'P345', 'P1258'], // limit, // offset diff --git a/src/populate-imdb-movies.ts b/src/populate-imdb-movies.ts index 8f661fb..271dd27 100644 --- a/src/populate-imdb-movies.ts +++ b/src/populate-imdb-movies.ts @@ -156,19 +156,20 @@ async function main() { console.log() console.log(`batch ${batchNum} done`, { numMovies, - numIMDBMoviesDownloaded + numIMDBMoviesDownloaded, + numIMDBMoviesDownloadedTotal }) ++batchNum } while (batchNum < numBatches) + await imdbMoviesDb.close() + console.log() console.log('done', { numMoviesTotal, numIMDBMoviesDownloadedTotal }) - - await imdbMoviesDb.close() } main() diff --git a/src/populate-rt-movies.ts b/src/populate-rt-movies.ts index 75ee710..806ccc1 100644 --- a/src/populate-rt-movies.ts +++ b/src/populate-rt-movies.ts @@ -61,26 +61,29 @@ async function main() { return null } - const rtUrls = Array.from( - new Set( - [ - movie.rtUrl, - rtMovies[movie.tmdbId]?.rtUrl, - movie.imdbId && wikidataMovies[movie.imdbId]?.rtUrl, - movie.imdbId && omdbMovies[movie.imdbId]?.tomatoURL - ] - .filter(Boolean) - .map((url) => url.trim().replace(/\/+$/g, '').trim()) - ) - ) - - // console.log( - // `${batchNum}:${index}`, - // movie.tmdbId, - // movie.title, - // 'rtUrls', - // Array.from(rtUrls) - // ) + const tempUrls = [ + movie.rtUrl, + rtMovies[movie.tmdbId]?.rtUrl, + movie.imdbId && wikidataMovies[movie.imdbId]?.rtUrl, + movie.imdbId && omdbMovies[movie.imdbId]?.tomatoURL + ] + .filter(Boolean) + .map((url) => url.trim().replace(/\/+$/g, '').trim()) + + const rtUrlsTemp = new Set() + const rtUrls: string[] = [] + for (const tempUrl of tempUrls) { + if (!rtUrlsTemp.has(tempUrl)) { + rtUrlsTemp.add(tempUrl) + rtUrls.push(tempUrl) + } + } + + if (rtUrls.length > 1) { + console.log(`${batchNum}:${index}`, movie.tmdbId, movie.title, { + rtUrls + }) + } let numErrors = 0