From b08918512056db914542d6af7d45cb12a4db4346 Mon Sep 17 00:00:00 2001 From: ikprk Date: Mon, 24 Jun 2024 13:02:14 +0200 Subject: [PATCH 01/13] New model to track orion language processing --- src/model/OrionOffchainCursor.ts | 20 ++++++++++++++++++++ src/model/index.ts | 1 + 2 files changed, 21 insertions(+) create mode 100644 src/model/OrionOffchainCursor.ts diff --git a/src/model/OrionOffchainCursor.ts b/src/model/OrionOffchainCursor.ts new file mode 100644 index 000000000..93d8bfa4d --- /dev/null +++ b/src/model/OrionOffchainCursor.ts @@ -0,0 +1,20 @@ +import { Entity, Column, PrimaryColumn } from 'typeorm' + +@Entity() +export class OrionOffchainCursor { + constructor(props?: Partial) { + Object.assign(this, props) + } + + /** + * Name of the offchain cursor + */ + @PrimaryColumn() + cursorName!: string + + /** + * Value of the cursor + */ + @Column('int8', { nullable: false }) + value!: number +} diff --git a/src/model/index.ts b/src/model/index.ts index 7ebc7a5a7..21c96308b 100644 --- a/src/model/index.ts +++ b/src/model/index.ts @@ -1,2 +1,3 @@ export * from './generated' export { NextEntityId } from './NextEntityId' +export { OrionOffchainCursor } from './OrionOffchainCursor' From 23d8f113ce03a98663738e319812910348c63d4c Mon Sep 17 00:00:00 2001 From: ikprk Date: Mon, 24 Jun 2024 13:02:40 +0200 Subject: [PATCH 02/13] New function to detect language --- .env | 3 +++ package-lock.json | 17 +++++++++++++++++ package.json | 1 + src/utils/language.ts | 10 ++++++++++ 4 files changed, 31 insertions(+) diff --git a/.env b/.env index 3cc110bab..c916afc28 100644 --- a/.env +++ b/.env @@ -62,6 +62,9 @@ TRUST_PROXY=uniquelocal SENDGRID_API_KEY= SENDGRID_FROM_EMAIL=gateway@example.com +# Detectlanguage +DETECTLANGUAGE_API_KEY= + # Debug settings SQD_DEBUG=api:* diff --git a/package-lock.json b/package-lock.json index d4b2b482c..660052322 100644 --- a/package-lock.json +++ b/package-lock.json @@ -40,6 +40,7 @@ "cookie-parser": "^1.4.6", "csv-stringify": "^6.3.0", 
"dayjs": "^1.11.7", + "detectlanguage": "^2.1.0", "dotenv": "^16.0.3", "dotenv-expand": "^10.0.0", "express-openapi-validator": "^5.0.3", @@ -13889,6 +13890,22 @@ "integrity": "sha512-T0NIuQpnTvFDATNuHN5roPwSBG83rFsuO+MXXH9/3N1eFbn4wcPjttvjMLEPWJ0RGUYgQE7cGgS3tNxbqCGM7g==", "dev": true }, + "node_modules/detectlanguage": { + "version": "2.1.0", + "resolved": "https://registry.npmjs.org/detectlanguage/-/detectlanguage-2.1.0.tgz", + "integrity": "sha512-EbLGyZxiQJeur5a+GNOzBV9xL/r/7GfvRALSHAKepw38UAvCssn7obVvhsioRIV+uDj3IQtXzL7iNkwu0oCp7g==", + "dependencies": { + "axios": "^0.21.1" + } + }, + "node_modules/detectlanguage/node_modules/axios": { + "version": "0.21.4", + "resolved": "https://registry.npmjs.org/axios/-/axios-0.21.4.tgz", + "integrity": "sha512-ut5vewkiu8jjGBdqpM44XxjuCjq9LAKeHVmoVfHVzy8eHgxxq8SbAVQNovDA8mVi05kP0Ea/n/UzcSHcTJQfNg==", + "dependencies": { + "follow-redirects": "^1.14.0" + } + }, "node_modules/dezalgo": { "version": "1.0.4", "resolved": "https://registry.npmjs.org/dezalgo/-/dezalgo-1.0.4.tgz", diff --git a/package.json b/package.json index 61462ea6c..fc40befab 100644 --- a/package.json +++ b/package.json @@ -72,6 +72,7 @@ "cookie-parser": "^1.4.6", "csv-stringify": "^6.3.0", "dayjs": "^1.11.7", + "detectlanguage": "^2.1.0", "dotenv": "^16.0.3", "dotenv-expand": "^10.0.0", "express-openapi-validator": "^5.0.3", diff --git a/src/utils/language.ts b/src/utils/language.ts index a9f8c2b45..4b74978ef 100644 --- a/src/utils/language.ts +++ b/src/utils/language.ts @@ -1,4 +1,9 @@ import { detectAll } from 'tinyld' +import DetectLanguage from 'detectlanguage' + +const languageDetectionApiKey = process.env.DETECTLANGUAGE_API_KEY + +const languageDetectionInstace = new DetectLanguage(languageDetectionApiKey ?? '') function cleanString(input: string): string { // First, remove URLs. This pattern targets a broad range of URLs. 
@@ -17,6 +22,11 @@ function predictLanguage(text: string): { lang: string; accuracy: number } | und return detectAll(cleanedText)?.[0] } +export async function predictLanguageForArray(texts: string[]) { + const result = await languageDetectionInstace.detect(texts) + return result.map((row) => row[0].language) +} + export function predictVideoLanguage({ title, description }: any): string | undefined { let detectedLang: string | undefined From 54e2844322a4e3a7baf893c5d4aec9c6af874207 Mon Sep 17 00:00:00 2001 From: ikprk Date: Mon, 24 Jun 2024 13:03:05 +0200 Subject: [PATCH 03/13] New custom migration --- .../setOrionLanguageProvider.ts | 73 +++++++++++++++++++ 1 file changed, 73 insertions(+) create mode 100644 src/utils/customMigrations/setOrionLanguageProvider.ts diff --git a/src/utils/customMigrations/setOrionLanguageProvider.ts b/src/utils/customMigrations/setOrionLanguageProvider.ts new file mode 100644 index 000000000..96798b216 --- /dev/null +++ b/src/utils/customMigrations/setOrionLanguageProvider.ts @@ -0,0 +1,73 @@ +import { EntityManager } from 'typeorm' +import { OrionOffchainCursor } from '../../model' +import { globalEm } from '../globalEm' +import { predictLanguageForArray } from '../language' + +const batchSize = 5 // Adjust the batch size based on your database and network performance + +let rowAffected = 0 + +const VIDEO_ORION_LANGUAGE_CURSOR_NAME = 'video_orion_language' + +async function detectVideoLanguageWithProvider() { + const em: EntityManager = await globalEm + const cursorEntity: { value: string }[] = await em.query( + `SELECT value FROM orion_offchain_cursor WHERE cursor_name='${VIDEO_ORION_LANGUAGE_CURSOR_NAME}'` + ) + const cursor = +(cursorEntity[0]?.value ?? 
0) + + const videos: { id: string; title: string; description: string }[] = await em.query(` + SELECT id, title, description + FROM admin.video + ORDER BY id::INTEGER ASC + OFFSET ${cursor} + LIMIT ${batchSize} + `) + + if (!videos.length) { + console.log('No more videos!') + return + } + + const mappedVideos = videos.map((video) => `${video.title} ${video.description}`) + + const predictionForVideos = await predictLanguageForArray(mappedVideos) + + const videosWithDetections = videos.map((video, index) => ({ + ...video, + detectedLanguage: predictionForVideos[index], + })) + + const query = ` + UPDATE admin.video AS v SET + orion_language = c.orion_language + FROM (VALUES ${videosWithDetections + .map((_, idx) => `($${idx * 2 + 1}, $${idx * 2 + 2})`) + .join(',')}) AS c(orion_language, id) + WHERE c.id = v.id; + ` + + const queryParams = videosWithDetections.flatMap((update) => [update.detectedLanguage, update.id]) + + // Execute batch update + await em.query(query, queryParams) + const newCursor = new OrionOffchainCursor({ + cursorName: VIDEO_ORION_LANGUAGE_CURSOR_NAME, + value: cursor + Math.min(batchSize, videos.length), + }) + await em.save(newCursor) + console.log( + `Updated languages for videos in range ${cursor}-${cursor + Math.min(batchSize, videos.length)}` + ) + + rowAffected += videos.length + + await detectVideoLanguageWithProvider() +} + +detectVideoLanguageWithProvider() + .then(() => console.log(`Update process completed. 
Rows affected ${rowAffected}`)) + .catch((e) => { + console.error('process failed', e) + process.exit(1) + }) From 06b09c1aef7c97ac684325eaa40c2465f58ae819 Mon Sep 17 00:00:00 2001 From: ikprk Date: Mon, 24 Jun 2024 13:03:36 +0200 Subject: [PATCH 04/13] Add cursor tracker and video orion language to offchain export --- src/utils/offchainState.ts | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/utils/offchainState.ts b/src/utils/offchainState.ts index 7360ed2c3..353e19322 100644 --- a/src/utils/offchainState.ts +++ b/src/utils/offchainState.ts @@ -60,8 +60,9 @@ const exportedStateMap: ExportedStateMap = { EmailDeliveryAttempt: true, Token: true, NextEntityId: true, + OrionOffchainCursor: true, Channel: ['isExcluded', 'videoViewsNum', 'followsNum', 'yppStatus', 'channelWeight'], - Video: ['isExcluded', 'viewsNum'], + Video: ['isExcluded', 'viewsNum', 'orionLanguage'], Comment: ['isExcluded'], OwnedNft: ['isFeatured'], VideoCategory: ['isSupported'], From 71c4997aa17fbd6b4952ee4c7b935a1c7ce26e02 Mon Sep 17 00:00:00 2001 From: ikprk Date: Mon, 24 Jun 2024 13:09:54 +0200 Subject: [PATCH 05/13] Running `custom-migration` command --- db/migrations/1719227204374-Data.js | 17 +++++++++++++++++ ...41962433-Views.js => 1719227204486-Views.js} | 4 ++-- 2 files changed, 19 insertions(+), 2 deletions(-) create mode 100644 db/migrations/1719227204374-Data.js rename db/migrations/{1709641962433-Views.js => 1719227204486-Views.js} (91%) diff --git a/db/migrations/1719227204374-Data.js b/db/migrations/1719227204374-Data.js new file mode 100644 index 000000000..6db16e689 --- /dev/null +++ b/db/migrations/1719227204374-Data.js @@ -0,0 +1,17 @@ +module.exports = class Data1719227204374 { + name = 'Data1719227204374' + + async up(db) { + await db.query(`CREATE TABLE "marketplace_token" ("liquidity" integer, "market_cap" numeric, "cumulative_revenue" numeric, "amm_volume" numeric, "last_day_price_change" numeric, "weekly_liq_change" numeric, "id" character varying 
NOT NULL, "status" character varying(6) NOT NULL, "avatar" jsonb, "total_supply" numeric NOT NULL, "is_featured" boolean NOT NULL, "symbol" text, "is_invite_only" boolean NOT NULL, "annual_creator_reward_permill" integer NOT NULL, "revenue_share_ratio_permill" integer NOT NULL, "created_at" TIMESTAMP WITH TIME ZONE NOT NULL, "channel_id" text, "description" text, "whitelist_applicant_note" text, "whitelist_applicant_link" text, "accounts_num" integer NOT NULL, "number_of_revenue_share_activations" integer NOT NULL, "deissued" boolean NOT NULL, "current_amm_sale_id" text, "current_sale_id" text, "current_revenue_share_id" text, "number_of_vested_transfer_issued" integer NOT NULL, "last_price" numeric, CONSTRAINT "PK_d836a8c3d907b67099c140c4d84" PRIMARY KEY ("id"))`) + await db.query(`CREATE INDEX "IDX_1268fd020cf195b2e8d5d85093" ON "marketplace_token" ("symbol") `) + await db.query(`CREATE INDEX "IDX_b99bb1ecee77f23016f6ef687c" ON "marketplace_token" ("created_at") `) + await db.query(`CREATE TABLE "orion_offchain_cursor" ("cursor_name" character varying NOT NULL, "value" bigint NOT NULL, CONSTRAINT "PK_7083797352af5a21224b6c8ccbc" PRIMARY KEY ("cursor_name"))`) + } + + async down(db) { + await db.query(`DROP TABLE "marketplace_token"`) + await db.query(`DROP INDEX "public"."IDX_1268fd020cf195b2e8d5d85093"`) + await db.query(`DROP INDEX "public"."IDX_b99bb1ecee77f23016f6ef687c"`) + await db.query(`DROP TABLE "orion_offchain_cursor"`) + } +} diff --git a/db/migrations/1709641962433-Views.js b/db/migrations/1719227204486-Views.js similarity index 91% rename from db/migrations/1709641962433-Views.js rename to db/migrations/1719227204486-Views.js index 247ab4dca..893cdf4d7 100644 --- a/db/migrations/1709641962433-Views.js +++ b/db/migrations/1719227204486-Views.js @@ -1,8 +1,8 @@ const { getViewDefinitions } = require('../viewDefinitions') -module.exports = class Views1709641962433 { - name = 'Views1709641962433' +module.exports = class Views1719227204486 { + name = 
'Views1719227204486' async up(db) { const viewDefinitions = getViewDefinitions(db); From 4f49891ae14736f5a19f98757e3f8fce08739c96 Mon Sep 17 00:00:00 2001 From: ikprk Date: Mon, 24 Jun 2024 13:40:38 +0200 Subject: [PATCH 06/13] Add manager to trigger video language updates --- src/mappings/utils.ts | 6 ++++ src/utils/OrionVideoLanguageManager.ts | 34 +++++++++++++++++++ .../setOrionLanguageProvider.ts | 17 ++-------- 3 files changed, 43 insertions(+), 14 deletions(-) create mode 100644 src/utils/OrionVideoLanguageManager.ts diff --git a/src/mappings/utils.ts b/src/mappings/utils.ts index 49d1f8877..ccc9f4d65 100644 --- a/src/mappings/utils.ts +++ b/src/mappings/utils.ts @@ -10,10 +10,16 @@ import { Event, MetaprotocolTransactionResultFailed, NftActivity, NftHistoryEntr import { CommentCountersManager } from '../utils/CommentsCountersManager' import { VideoRelevanceManager } from '../utils/VideoRelevanceManager' import { EntityManagerOverlay } from '../utils/overlay' +import { OrionVideoLanguageManager } from '../utils/OrionVideoLanguageManager' +const orionVideoLanguageManager = new OrionVideoLanguageManager() export const commentCountersManager = new CommentCountersManager() export const videoRelevanceManager = new VideoRelevanceManager() // eslint-disable-next-line no-void +void orionVideoLanguageManager.init( + 1000 * 60 * 5 // 5 mins +) +// eslint-disable-next-line no-void void videoRelevanceManager.init({ fullUpdateLoopTime: 1000 * 60 * 60 * 12, // 12 hrs scheduledUpdateLoopTime: 1000 * 60 * 10, // 10 mins diff --git a/src/utils/OrionVideoLanguageManager.ts b/src/utils/OrionVideoLanguageManager.ts new file mode 100644 index 000000000..33e3c2f8f --- /dev/null +++ b/src/utils/OrionVideoLanguageManager.ts @@ -0,0 +1,34 @@ +import { + detectVideoLanguageWithProvider, + VIDEO_ORION_LANGUAGE_CURSOR_NAME, +} from './customMigrations/setOrionLanguageProvider' + +export class OrionVideoLanguageManager { + async init(intervalMs: number): Promise { + if 
(!VIDEO_ORION_LANGUAGE_CURSOR_NAME) { + return + } + + this.updateLoop(intervalMs) + .then(() => { + /* Do nothing */ + }) + .catch((err) => { + console.error(err) + process.exit(-1) + }) + } + + async updateOrionVideoLanguage() { + return detectVideoLanguageWithProvider() + } + + private async updateLoop(intervalMs: number): Promise { + while (true) { + await this.updateOrionVideoLanguage().catch((e) => { + console.log(`Updating Orion language with provider failed`, e) + }) + await new Promise((resolve) => setTimeout(resolve, intervalMs)) + } + } +} diff --git a/src/utils/customMigrations/setOrionLanguageProvider.ts b/src/utils/customMigrations/setOrionLanguageProvider.ts index 96798b216..13114fb3a 100644 --- a/src/utils/customMigrations/setOrionLanguageProvider.ts +++ b/src/utils/customMigrations/setOrionLanguageProvider.ts @@ -3,13 +3,11 @@ import { OrionOffchainCursor } from '../../model' import { globalEm } from '../globalEm' import { predictLanguageForArray } from '../language' -const batchSize = 5 // Adjust the batch size based on your database and network performance +const batchSize = 5_000 // Adjust the batch size based on your database and network performance -let rowAffected = 0 +export const VIDEO_ORION_LANGUAGE_CURSOR_NAME = 'video_orion_language' -const VIDEO_ORION_LANGUAGE_CURSOR_NAME = 'video_orion_language' - -async function detectVideoLanguageWithProvider() { +export async function detectVideoLanguageWithProvider() { const em: EntityManager = await globalEm const cursorEntity: { value: string }[] = await em.query( `SELECT value FROM orion_offchain_cursor WHERE cursor_name='${VIDEO_ORION_LANGUAGE_CURSOR_NAME}'` @@ -60,14 +58,5 @@ async function detectVideoLanguageWithProvider() { `Updated languages for videos in range ${cursor}-${cursor + Math.min(batchSize, videos.length)}` ) - rowAffected += videos.length - await detectVideoLanguageWithProvider() } - -detectVideoLanguageWithProvider() - .then(() => console.log(`Update process completed. 
Rows affected ${rowAffected}`)) - .catch((e) => { - console.error('process failed', e) - process.exit(1) - }) From 72c3f64ddedb951c920f04c163bfa47c0357f96a Mon Sep 17 00:00:00 2001 From: ikprk Date: Mon, 24 Jun 2024 14:38:27 +0200 Subject: [PATCH 07/13] Adjust orion video language manager to support video update --- src/mappings/content/video.ts | 25 +++++++--- src/mappings/utils.ts | 2 +- src/utils/OrionVideoLanguageManager.ts | 28 +++++++++++ .../setOrionLanguageProvider.ts | 49 +++++++++++-------- 4 files changed, 75 insertions(+), 29 deletions(-) diff --git a/src/mappings/content/video.ts b/src/mappings/content/video.ts index 8589f202e..d3bde7d7d 100644 --- a/src/mappings/content/video.ts +++ b/src/mappings/content/video.ts @@ -16,11 +16,14 @@ import { VideoPosted, VideoViewEvent, } from '../../model' +import { VIDEO_ORION_LANGUAGE_CURSOR_NAME } from '../../utils/customMigrations/setOrionLanguageProvider' import { EventHandlerContext } from '../../utils/events' import { predictVideoLanguage } from '../../utils/language' +import { OrionVideoLanguageManager } from '../../utils/OrionVideoLanguageManager' import { deserializeMetadata, genericEventFields, + orionVideoLanguageManager, u8aToBytes, videoRelevanceManager, } from '../utils' @@ -122,10 +125,12 @@ export async function processVideoCreatedEvent({ } } - video.orionLanguage = predictVideoLanguage({ - title: video.title ?? '', - description: video.description ?? '', - }) + video.orionLanguage = VIDEO_ORION_LANGUAGE_CURSOR_NAME + ? null + : predictVideoLanguage({ + title: video.title ?? '', + description: video.description ?? '', + }) channel.totalVideosCreated += 1 @@ -192,10 +197,14 @@ export async function processVideoUpdatedEvent({ ) } - video.orionLanguage = predictVideoLanguage({ - title: video.title ?? '', - description: video.description ?? 
'', - }) + if (VIDEO_ORION_LANGUAGE_CURSOR_NAME) { + orionVideoLanguageManager.scheduleVideoForDetection(video.id) + } else { + video.orionLanguage = predictVideoLanguage({ + title: video.title ?? '', + description: video.description ?? '', + }) + } if (autoIssueNft) { await processNft(overlay, block, indexInBlock, extrinsicHash, video, contentActor, autoIssueNft) diff --git a/src/mappings/utils.ts b/src/mappings/utils.ts index ccc9f4d65..4f2da75f3 100644 --- a/src/mappings/utils.ts +++ b/src/mappings/utils.ts @@ -12,7 +12,7 @@ import { VideoRelevanceManager } from '../utils/VideoRelevanceManager' import { EntityManagerOverlay } from '../utils/overlay' import { OrionVideoLanguageManager } from '../utils/OrionVideoLanguageManager' -const orionVideoLanguageManager = new OrionVideoLanguageManager() +export const orionVideoLanguageManager = new OrionVideoLanguageManager() export const commentCountersManager = new CommentCountersManager() export const videoRelevanceManager = new VideoRelevanceManager() // eslint-disable-next-line no-void diff --git a/src/utils/OrionVideoLanguageManager.ts b/src/utils/OrionVideoLanguageManager.ts index 33e3c2f8f..d177cb44c 100644 --- a/src/utils/OrionVideoLanguageManager.ts +++ b/src/utils/OrionVideoLanguageManager.ts @@ -1,9 +1,14 @@ +import { EntityManager } from 'typeorm' import { detectVideoLanguageWithProvider, + updateVideoLanguages, VIDEO_ORION_LANGUAGE_CURSOR_NAME, } from './customMigrations/setOrionLanguageProvider' +import { globalEm } from './globalEm' export class OrionVideoLanguageManager { + private videoToDetect: Set = new Set() + async init(intervalMs: number): Promise { if (!VIDEO_ORION_LANGUAGE_CURSOR_NAME) { return @@ -19,12 +24,35 @@ export class OrionVideoLanguageManager { }) } + scheduleVideoForDetection(id: string | null | undefined) { + if (id) { + this.videoToDetect.add(id) + } + } + + async updateScheduledVideoLanguage(em: EntityManager) { + if (!this.videoToDetect.size) { + return + } + + const videos = await 
em.query(` + SELECT id, title, description + FROM admin.video + WHERE id in (${[...this.videoToDetect.values()].map((id) => `'${id}'`).join(',')}) + `) + + await updateVideoLanguages(em, videos) + this.videoToDetect.clear() + } + async updateOrionVideoLanguage() { return detectVideoLanguageWithProvider() } private async updateLoop(intervalMs: number): Promise { + const em = await globalEm while (true) { + await this.updateScheduledVideoLanguage(em) await this.updateOrionVideoLanguage().catch((e) => { console.log(`Updating Orion language with provider failed`, e) }) diff --git a/src/utils/customMigrations/setOrionLanguageProvider.ts b/src/utils/customMigrations/setOrionLanguageProvider.ts index 13114fb3a..94f2b959e 100644 --- a/src/utils/customMigrations/setOrionLanguageProvider.ts +++ b/src/utils/customMigrations/setOrionLanguageProvider.ts @@ -5,28 +5,15 @@ import { predictLanguageForArray } from '../language' const batchSize = 5_000 // Adjust the batch size based on your database and network performance -export const VIDEO_ORION_LANGUAGE_CURSOR_NAME = 'video_orion_language' - -export async function detectVideoLanguageWithProvider() { - const em: EntityManager = await globalEm - const cursorEntity: { value: string }[] = await em.query( - `SELECT value FROM orion_offchain_cursor WHERE cursor_name='${VIDEO_ORION_LANGUAGE_CURSOR_NAME}'` - ) - const cursor = +(cursorEntity[0]?.value ?? 
0) - - const videos: { id: string; title: string; description: string }[] = await em.query(` - SELECT id, title, description - FROM admin.video - ORDER BY id::INTEGER ASC - OFFSET ${cursor} - LIMIT ${batchSize} - `) +type VideoUpdateType = { + id: string + title: string + description: string +} - if (!videos.length) { - console.log('No more videos!') - return - } +export const VIDEO_ORION_LANGUAGE_CURSOR_NAME = 'video_orion_language' +export async function updateVideoLanguages(em: EntityManager, videos: VideoUpdateType[]) { const mappedVideos = videos.map((video) => `${video.title} ${video.description}`) const predictionForVideos = await predictLanguageForArray(mappedVideos) @@ -49,6 +36,28 @@ export async function detectVideoLanguageWithProvider() { // Execute batch update await em.query(query, queryParams) +} + +export async function detectVideoLanguageWithProvider() { + const em: EntityManager = await globalEm + const cursorEntity: { value: string }[] = await em.query( + `SELECT value FROM orion_offchain_cursor WHERE cursor_name='${VIDEO_ORION_LANGUAGE_CURSOR_NAME}'` + ) + const cursor = +(cursorEntity[0]?.value ?? 0) + + const videos: VideoUpdateType[] = await em.query(` + SELECT id, title, description + FROM admin.video + ORDER BY id::INTEGER ASC + OFFSET ${cursor} + LIMIT ${batchSize} + `) + + if (!videos.length) { + console.log('No more videos!') + return + } + await updateVideoLanguages(em, videos) const newCursor = new OrionOffchainCursor({ cursorName: VIDEO_ORION_LANGUAGE_CURSOR_NAME, value: cursor + Math.min(batchSize, videos.length), From 32d7c5b0bd4bc519b809bf0512cc2ef86e98aec9 Mon Sep 17 00:00:00 2001 From: ikprk Date: Mon, 24 Jun 2024 14:47:35 +0200 Subject: [PATCH 08/13] Revert "Running `custom-migration` command" This reverts commit 71c4997aa17fbd6b4952ee4c7b935a1c7ce26e02. 
--- ...27204486-Views.js => 1709641962433-Views.js} | 4 ++-- db/migrations/1719227204374-Data.js | 17 ----------------- 2 files changed, 2 insertions(+), 19 deletions(-) rename db/migrations/{1719227204486-Views.js => 1709641962433-Views.js} (91%) delete mode 100644 db/migrations/1719227204374-Data.js diff --git a/db/migrations/1719227204486-Views.js b/db/migrations/1709641962433-Views.js similarity index 91% rename from db/migrations/1719227204486-Views.js rename to db/migrations/1709641962433-Views.js index 893cdf4d7..247ab4dca 100644 --- a/db/migrations/1719227204486-Views.js +++ b/db/migrations/1709641962433-Views.js @@ -1,8 +1,8 @@ const { getViewDefinitions } = require('../viewDefinitions') -module.exports = class Views1719227204486 { - name = 'Views1719227204486' +module.exports = class Views1709641962433 { + name = 'Views1709641962433' async up(db) { const viewDefinitions = getViewDefinitions(db); diff --git a/db/migrations/1719227204374-Data.js b/db/migrations/1719227204374-Data.js deleted file mode 100644 index 6db16e689..000000000 --- a/db/migrations/1719227204374-Data.js +++ /dev/null @@ -1,17 +0,0 @@ -module.exports = class Data1719227204374 { - name = 'Data1719227204374' - - async up(db) { - await db.query(`CREATE TABLE "marketplace_token" ("liquidity" integer, "market_cap" numeric, "cumulative_revenue" numeric, "amm_volume" numeric, "last_day_price_change" numeric, "weekly_liq_change" numeric, "id" character varying NOT NULL, "status" character varying(6) NOT NULL, "avatar" jsonb, "total_supply" numeric NOT NULL, "is_featured" boolean NOT NULL, "symbol" text, "is_invite_only" boolean NOT NULL, "annual_creator_reward_permill" integer NOT NULL, "revenue_share_ratio_permill" integer NOT NULL, "created_at" TIMESTAMP WITH TIME ZONE NOT NULL, "channel_id" text, "description" text, "whitelist_applicant_note" text, "whitelist_applicant_link" text, "accounts_num" integer NOT NULL, "number_of_revenue_share_activations" integer NOT NULL, "deissued" boolean NOT 
NULL, "current_amm_sale_id" text, "current_sale_id" text, "current_revenue_share_id" text, "number_of_vested_transfer_issued" integer NOT NULL, "last_price" numeric, CONSTRAINT "PK_d836a8c3d907b67099c140c4d84" PRIMARY KEY ("id"))`) - await db.query(`CREATE INDEX "IDX_1268fd020cf195b2e8d5d85093" ON "marketplace_token" ("symbol") `) - await db.query(`CREATE INDEX "IDX_b99bb1ecee77f23016f6ef687c" ON "marketplace_token" ("created_at") `) - await db.query(`CREATE TABLE "orion_offchain_cursor" ("cursor_name" character varying NOT NULL, "value" bigint NOT NULL, CONSTRAINT "PK_7083797352af5a21224b6c8ccbc" PRIMARY KEY ("cursor_name"))`) - } - - async down(db) { - await db.query(`DROP TABLE "marketplace_token"`) - await db.query(`DROP INDEX "public"."IDX_1268fd020cf195b2e8d5d85093"`) - await db.query(`DROP INDEX "public"."IDX_b99bb1ecee77f23016f6ef687c"`) - await db.query(`DROP TABLE "orion_offchain_cursor"`) - } -} From c42d1acca69ca862ab0e27117f9c9c733b1d65d0 Mon Sep 17 00:00:00 2001 From: ikprk Date: Mon, 24 Jun 2024 14:54:19 +0200 Subject: [PATCH 09/13] Second run of `create-migrations` command --- db/migrations/1719233585592-Data.js | 11 +++++++++++ ...{1709641962433-Views.js => 1719233585692-Views.js} | 4 ++-- 2 files changed, 13 insertions(+), 2 deletions(-) create mode 100644 db/migrations/1719233585592-Data.js rename db/migrations/{1709641962433-Views.js => 1719233585692-Views.js} (91%) diff --git a/db/migrations/1719233585592-Data.js b/db/migrations/1719233585592-Data.js new file mode 100644 index 000000000..a9e5c73a3 --- /dev/null +++ b/db/migrations/1719233585592-Data.js @@ -0,0 +1,11 @@ +module.exports = class Data1719233585592 { + name = 'Data1719233585592' + + async up(db) { + await db.query(`CREATE TABLE "orion_offchain_cursor" ("cursor_name" character varying NOT NULL, "value" bigint NOT NULL, CONSTRAINT "PK_7083797352af5a21224b6c8ccbc" PRIMARY KEY ("cursor_name"))`) + } + + async down(db) { + await db.query(`DROP TABLE "orion_offchain_cursor"`) + } +} diff 
--git a/db/migrations/1709641962433-Views.js b/db/migrations/1719233585692-Views.js similarity index 91% rename from db/migrations/1709641962433-Views.js rename to db/migrations/1719233585692-Views.js index 247ab4dca..e2d477f36 100644 --- a/db/migrations/1709641962433-Views.js +++ b/db/migrations/1719233585692-Views.js @@ -1,8 +1,8 @@ const { getViewDefinitions } = require('../viewDefinitions') -module.exports = class Views1709641962433 { - name = 'Views1709641962433' +module.exports = class Views1719233585692 { + name = 'Views1719233585692' async up(db) { const viewDefinitions = getViewDefinitions(db); From 79451064e1a53779014ca6b964eef83dbfe42363 Mon Sep 17 00:00:00 2001 From: ikprk Date: Wed, 3 Jul 2024 08:39:05 +0200 Subject: [PATCH 10/13] CR fixes --- package-lock.json | 16 ------ package.json | 3 +- src/mappings/content/video.ts | 19 +------ src/utils/OrionVideoLanguageManager.ts | 4 +- .../customMigrations/setOrionLanguage.ts | 56 ------------------- .../setOrionLanguageProvider.ts | 44 ++++++++------- src/utils/language.ts | 31 +--------- 7 files changed, 33 insertions(+), 140 deletions(-) delete mode 100644 src/utils/customMigrations/setOrionLanguage.ts diff --git a/package-lock.json b/package-lock.json index 660052322..50059f6c4 100644 --- a/package-lock.json +++ b/package-lock.json @@ -56,7 +56,6 @@ "patch-package": "^6.5.0", "pg": "8.8.0", "swagger-ui-express": "^4.6.2", - "tinyld": "^1.3.4", "type-graphql": "^1.2.0-rc.1", "typeorm": "^0.3.11", "ua-parser-js": "^1.0.34", @@ -25813,21 +25812,6 @@ "next-tick": "1" } }, - "node_modules/tinyld": { - "version": "1.3.4", - "resolved": "https://registry.npmjs.org/tinyld/-/tinyld-1.3.4.tgz", - "integrity": "sha512-u26CNoaInA4XpDU+8s/6Cq8xHc2T5M4fXB3ICfXPokUQoLzmPgSZU02TAkFwFMJCWTjk53gtkS8pETTreZwCqw==", - "bin": { - "tinyld": "bin/tinyld.js", - "tinyld-heavy": "bin/tinyld-heavy.js", - "tinyld-light": "bin/tinyld-light.js" - }, - "engines": { - "node": ">= 12.10.0", - "npm": ">= 6.12.0", - "yarn": ">= 1.20.0" - 
} - }, "node_modules/title-case": { "version": "3.0.3", "resolved": "https://registry.npmjs.org/title-case/-/title-case-3.0.3.tgz", diff --git a/package.json b/package.json index fc40befab..33c3e5a96 100644 --- a/package.json +++ b/package.json @@ -88,7 +88,6 @@ "patch-package": "^6.5.0", "pg": "8.8.0", "swagger-ui-express": "^4.6.2", - "tinyld": "^1.3.4", "type-graphql": "^1.2.0-rc.1", "typeorm": "^0.3.11", "ua-parser-js": "^1.0.34", @@ -106,8 +105,8 @@ "@subsquid/substrate-typegen": "^2.1.0", "@subsquid/typeorm-codegen": "0.3.1", "@types/async-lock": "^1.1.3", - "@types/chai": "^4.3.11", "@types/big-json": "^3.2.4", + "@types/chai": "^4.3.11", "@types/cookie-parser": "^1.4.3", "@types/express-rate-limit": "^6.0.0", "@types/mocha": "^10.0.1", diff --git a/src/mappings/content/video.ts b/src/mappings/content/video.ts index d3bde7d7d..7249b2d42 100644 --- a/src/mappings/content/video.ts +++ b/src/mappings/content/video.ts @@ -16,10 +16,7 @@ import { VideoPosted, VideoViewEvent, } from '../../model' -import { VIDEO_ORION_LANGUAGE_CURSOR_NAME } from '../../utils/customMigrations/setOrionLanguageProvider' import { EventHandlerContext } from '../../utils/events' -import { predictVideoLanguage } from '../../utils/language' -import { OrionVideoLanguageManager } from '../../utils/OrionVideoLanguageManager' import { deserializeMetadata, genericEventFields, @@ -125,12 +122,7 @@ export async function processVideoCreatedEvent({ } } - video.orionLanguage = VIDEO_ORION_LANGUAGE_CURSOR_NAME - ? null - : predictVideoLanguage({ - title: video.title ?? '', - description: video.description ?? '', - }) + video.orionLanguage = null channel.totalVideosCreated += 1 @@ -197,14 +189,7 @@ export async function processVideoUpdatedEvent({ ) } - if (VIDEO_ORION_LANGUAGE_CURSOR_NAME) { - orionVideoLanguageManager.scheduleVideoForDetection(video.id) - } else { - video.orionLanguage = predictVideoLanguage({ - title: video.title ?? '', - description: video.description ?? 
'', - }) - } + orionVideoLanguageManager.scheduleVideoForDetection(video.id) if (autoIssueNft) { await processNft(overlay, block, indexInBlock, extrinsicHash, video, contentActor, autoIssueNft) diff --git a/src/utils/OrionVideoLanguageManager.ts b/src/utils/OrionVideoLanguageManager.ts index d177cb44c..5e059fb27 100644 --- a/src/utils/OrionVideoLanguageManager.ts +++ b/src/utils/OrionVideoLanguageManager.ts @@ -52,7 +52,9 @@ export class OrionVideoLanguageManager { private async updateLoop(intervalMs: number): Promise { const em = await globalEm while (true) { - await this.updateScheduledVideoLanguage(em) + await this.updateScheduledVideoLanguage(em).catch((e) => { + console.log(`Updating scheduled videos Orion language with provider failed`, e) + }) await this.updateOrionVideoLanguage().catch((e) => { console.log(`Updating Orion language with provider failed`, e) }) diff --git a/src/utils/customMigrations/setOrionLanguage.ts b/src/utils/customMigrations/setOrionLanguage.ts deleted file mode 100644 index 79299fe73..000000000 --- a/src/utils/customMigrations/setOrionLanguage.ts +++ /dev/null @@ -1,56 +0,0 @@ -import { EntityManager } from 'typeorm' -import { globalEm } from '../globalEm' -import { predictVideoLanguage } from '../language' - -async function detectVideoLanguage() { - const em: EntityManager = await globalEm - const videos: any[] = await em.query(` - SELECT id, title, description - FROM admin.video - `) - - // Temporary storage for batch update data - const updates: any[] = [] - - for (const [i, video] of videos.entries()) { - const orionLanguage = predictVideoLanguage({ - title: video.title, - description: video.description, - }) - - // Instead of updating immediately, push the update data into the array - updates.push({ orionLanguage, id: video.id }) - console.log(i) - } - - // Define batch size - const batchSize = 1000 // Adjust the batch size based on your database and network performance - - for (let i = 0; i < updates.length; i += batchSize) { - 
const batch = updates.slice(i, i + batchSize) - - // Prepare the query and parameters for batch update - const query = ` - UPDATE admin.video AS v SET - orion_language = c.orion_language - FROM (VALUES ${batch - .map((_, idx) => `($${idx * 2 + 1}, $${idx * 2 + 2})`) - .join(',')}) AS c(orion_language, id) - WHERE c.id = v.id; - ` - - const queryParams = batch.flatMap((update) => [update.orionLanguage, update.id]) - - // Execute batch update - await em.query(query, queryParams) - } - - console.log(`Updated languages for ${videos.length} videos`) -} - -detectVideoLanguage() - .then(() => console.log('Update process completed.')) - .catch(() => { - console.error('process failed') - process.exit(1) - }) diff --git a/src/utils/customMigrations/setOrionLanguageProvider.ts b/src/utils/customMigrations/setOrionLanguageProvider.ts index 94f2b959e..0ec55a8a7 100644 --- a/src/utils/customMigrations/setOrionLanguageProvider.ts +++ b/src/utils/customMigrations/setOrionLanguageProvider.ts @@ -1,7 +1,7 @@ import { EntityManager } from 'typeorm' import { OrionOffchainCursor } from '../../model' import { globalEm } from '../globalEm' -import { predictLanguageForArray } from '../language' +import { predictLanguageWithProvider } from '../language' const batchSize = 5_000 // Adjust the batch size based on your database and network performance @@ -16,7 +16,7 @@ export const VIDEO_ORION_LANGUAGE_CURSOR_NAME = 'video_orion_language' export async function updateVideoLanguages(em: EntityManager, videos: VideoUpdateType[]) { const mappedVideos = videos.map((video) => `${video.title} ${video.description}`) - const predictionForVideos = await predictLanguageForArray(mappedVideos) + const predictionForVideos = await predictLanguageWithProvider(mappedVideos) const videosWithDetections = videos.map((video, index) => ({ ...video, @@ -40,32 +40,36 @@ export async function updateVideoLanguages(em: EntityManager, videos: VideoUpdat export async function detectVideoLanguageWithProvider() { const em: 
EntityManager = await globalEm - const cursorEntity: { value: string }[] = await em.query( + let cursorEntity: { value: number }[] = await em.query( `SELECT value FROM orion_offchain_cursor WHERE cursor_name='${VIDEO_ORION_LANGUAGE_CURSOR_NAME}'` ) - const cursor = +(cursorEntity[0]?.value ?? 0) + while (true) { + const cursor = +(cursorEntity[0]?.value ?? 0) - const videos: VideoUpdateType[] = await em.query(` + const videos: VideoUpdateType[] = await em.query(` SELECT id, title, description FROM admin.video ORDER BY id::INTEGER ASC OFFSET ${cursor} LIMIT ${batchSize} - `) + `) - if (!videos.length) { - console.log('No more videos!') - return - } - await updateVideoLanguages(em, videos) - const newCursor = new OrionOffchainCursor({ - cursorName: VIDEO_ORION_LANGUAGE_CURSOR_NAME, - value: cursor + Math.min(batchSize, videos.length), - }) - await em.save(newCursor) - console.log( - `Updated languages for videos in range ${cursor}-${cursor + Math.min(batchSize, videos.length)}` - ) + if (!videos.length) { + console.log('No more videos!') + break + } - await detectVideoLanguageWithProvider() + await updateVideoLanguages(em, videos) + const newCursor = new OrionOffchainCursor({ + cursorName: VIDEO_ORION_LANGUAGE_CURSOR_NAME, + value: cursor + Math.min(batchSize, videos.length), + }) + await em.save(newCursor) + cursorEntity = [newCursor] + console.log( + `Updated languages for videos in range ${cursor}-${ + cursor + Math.min(batchSize, videos.length) + }` + ) + } } diff --git a/src/utils/language.ts b/src/utils/language.ts index 4b74978ef..1846766c6 100644 --- a/src/utils/language.ts +++ b/src/utils/language.ts @@ -1,4 +1,3 @@ -import { detectAll } from 'tinyld' import DetectLanguage from 'detectlanguage' const languageDetectionApiKey = process.env.DETECTLANGUAGE_API_KEY @@ -15,32 +14,8 @@ function cleanString(input: string): string { return cleanedString } -function predictLanguage(text: string): { lang: string; accuracy: number } | undefined { - const cleanedText = 
cleanString(text) - - // Get the most accurate language prediction - return detectAll(cleanedText)?.[0] -} - -export async function predictLanguageForArray(texts: string[]) { - const result = await languageDetectionInstace.detect(texts) +export async function predictLanguageWithProvider(texts: string[]) { + const cleanedTexts = texts.map(cleanString) + const result = await languageDetectionInstace.detect(cleanedTexts) return result.map((row) => row[0].language) } - -export function predictVideoLanguage({ title, description }: any): string | undefined { - let detectedLang: string | undefined - - const titleLang = predictLanguage(title ?? '') - - detectedLang = titleLang?.lang - - if ((titleLang?.accuracy || 0) < 0.5) { - const titleAndDescriptionLang = predictLanguage(`${title} ${description}`) - if ((titleAndDescriptionLang?.accuracy || 0) > (titleLang?.accuracy || 0)) { - // then - detectedLang = titleAndDescriptionLang?.lang - } - } - - return detectedLang -} From 8311af2e7a45d9164fe5dc7ce0420eddc884a4f0 Mon Sep 17 00:00:00 2001 From: Zeeshan Akram <97m.zeeshan@gmail.com> Date: Wed, 10 Jul 2024 19:50:53 +0500 Subject: [PATCH 11/13] move 'orion_offchain_cursor' table to admin schema --- db/migrations/1719233585592-Data.js | 11 ----------- db/migrations/1720623003671-Data.js | 11 +++++++++++ ...{1719233585692-Views.js => 1720623003800-Views.js} | 4 ++-- src/model/OrionOffchainCursor.ts | 4 ++-- .../customMigrations/setOrionLanguageProvider.ts | 2 +- 5 files changed, 16 insertions(+), 16 deletions(-) delete mode 100644 db/migrations/1719233585592-Data.js create mode 100644 db/migrations/1720623003671-Data.js rename db/migrations/{1719233585692-Views.js => 1720623003800-Views.js} (91%) diff --git a/db/migrations/1719233585592-Data.js b/db/migrations/1719233585592-Data.js deleted file mode 100644 index a9e5c73a3..000000000 --- a/db/migrations/1719233585592-Data.js +++ /dev/null @@ -1,11 +0,0 @@ -module.exports = class Data1719233585592 { - name = 'Data1719233585592' - 
- async up(db) { - await db.query(`CREATE TABLE "orion_offchain_cursor" ("cursor_name" character varying NOT NULL, "value" bigint NOT NULL, CONSTRAINT "PK_7083797352af5a21224b6c8ccbc" PRIMARY KEY ("cursor_name"))`) - } - - async down(db) { - await db.query(`DROP TABLE "orion_offchain_cursor"`) - } -} diff --git a/db/migrations/1720623003671-Data.js b/db/migrations/1720623003671-Data.js new file mode 100644 index 000000000..5a14f9337 --- /dev/null +++ b/db/migrations/1720623003671-Data.js @@ -0,0 +1,11 @@ +module.exports = class Data1720623003671 { + name = 'Data1720623003671' + + async up(db) { + await db.query(`CREATE TABLE "admin"."orion_offchain_cursor" ("cursor_name" character varying NOT NULL, "value" bigint NOT NULL, CONSTRAINT "PK_7083797352af5a21224b6c8ccbc" PRIMARY KEY ("cursor_name"))`) + } + + async down(db) { + await db.query(`DROP TABLE "admin"."orion_offchain_cursor"`) + } +} diff --git a/db/migrations/1719233585692-Views.js b/db/migrations/1720623003800-Views.js similarity index 91% rename from db/migrations/1719233585692-Views.js rename to db/migrations/1720623003800-Views.js index e2d477f36..b138499f6 100644 --- a/db/migrations/1719233585692-Views.js +++ b/db/migrations/1720623003800-Views.js @@ -1,8 +1,8 @@ const { getViewDefinitions } = require('../viewDefinitions') -module.exports = class Views1719233585692 { - name = 'Views1719233585692' +module.exports = class Views1720623003800 { + name = 'Views1720623003800' async up(db) { const viewDefinitions = getViewDefinitions(db); diff --git a/src/model/OrionOffchainCursor.ts b/src/model/OrionOffchainCursor.ts index 93d8bfa4d..92854164b 100644 --- a/src/model/OrionOffchainCursor.ts +++ b/src/model/OrionOffchainCursor.ts @@ -1,6 +1,6 @@ -import { Entity, Column, PrimaryColumn } from 'typeorm' +import { Column, Entity, PrimaryColumn } from 'typeorm' -@Entity() +@Entity({ schema: 'admin' }) export class OrionOffchainCursor { constructor(props?: Partial) { Object.assign(this, props) diff --git 
a/src/utils/customMigrations/setOrionLanguageProvider.ts b/src/utils/customMigrations/setOrionLanguageProvider.ts index 0ec55a8a7..e80f2b0e7 100644 --- a/src/utils/customMigrations/setOrionLanguageProvider.ts +++ b/src/utils/customMigrations/setOrionLanguageProvider.ts @@ -41,7 +41,7 @@ export async function updateVideoLanguages(em: EntityManager, videos: VideoUpdat export async function detectVideoLanguageWithProvider() { const em: EntityManager = await globalEm let cursorEntity: { value: number }[] = await em.query( - `SELECT value FROM orion_offchain_cursor WHERE cursor_name='${VIDEO_ORION_LANGUAGE_CURSOR_NAME}'` + `SELECT value FROM admin.orion_offchain_cursor WHERE cursor_name='${VIDEO_ORION_LANGUAGE_CURSOR_NAME}'` ) while (true) { const cursor = +(cursorEntity[0]?.value ?? 0) From 8e2a2d34df3a88b8ff2c0d2fac5eb5bfbc177d4d Mon Sep 17 00:00:00 2001 From: Zeeshan Akram <97m.zeeshan@gmail.com> Date: Wed, 10 Jul 2024 19:52:31 +0500 Subject: [PATCH 12/13] bump package version and add change log --- CHANGELOG.md | 5 +++++ package-lock.json | 4 ++-- package.json | 2 +- 3 files changed, 8 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 598ddafa1..9ae4ec682 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,8 @@ +# 4.0.4 + +## Bug Fixes: +- Fixed: improve the accuracy of `Video.orionLanguage` field by reworking the `predictVideoLanguage` function in `src/utils/language.ts` + # 4.0.3 ## Misc diff --git a/package-lock.json b/package-lock.json index 50059f6c4..a0650bcdd 100644 --- a/package-lock.json +++ b/package-lock.json @@ -1,12 +1,12 @@ { "name": "orion", - "version": "4.0.3", + "version": "4.0.4", "lockfileVersion": 3, "requires": true, "packages": { "": { "name": "orion", - "version": "4.0.3", + "version": "4.0.4", "hasInstallScript": true, "workspaces": [ "network-tests" diff --git a/package.json b/package.json index 33c3e5a96..f2ec73eda 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "orion", - "version": 
"4.0.3", + "version": "4.0.4", "engines": { "node": ">=16" }, From 61db7d078f2c2d3fd48e1e6e35240df4e40e447d Mon Sep 17 00:00:00 2001 From: Zeeshan Akram <97m.zeeshan@gmail.com> Date: Wed, 10 Jul 2024 21:03:02 +0500 Subject: [PATCH 13/13] fix: bug in case detected language is undefined --- src/utils/language.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/utils/language.ts b/src/utils/language.ts index 1846766c6..d482ac1f8 100644 --- a/src/utils/language.ts +++ b/src/utils/language.ts @@ -17,5 +17,5 @@ function cleanString(input: string): string { export async function predictLanguageWithProvider(texts: string[]) { const cleanedTexts = texts.map(cleanString) const result = await languageDetectionInstace.detect(cleanedTexts) - return result.map((row) => row[0].language) + return result.map((row) => row[0]?.language) }