diff --git a/.github/workflows/assets.yml b/.github/workflows/assets.yml
index a4d50dad42..10e8a30ea1 100644
--- a/.github/workflows/assets.yml
+++ b/.github/workflows/assets.yml
@@ -39,7 +39,7 @@ jobs:
         id: codebooks
         working-directory: ./packages/adblocker/
         run: |
-          yarn generate-codebooks
+          CI=true yarn generate-codebooks
 
       - uses: tibdex/github-app-token@v1
         id: generate-token
diff --git a/packages/adblocker/package.json b/packages/adblocker/package.json
index e2aabe209f..f596e6fa6c 100644
--- a/packages/adblocker/package.json
+++ b/packages/adblocker/package.json
@@ -51,14 +51,14 @@
     "dev": "mocha --watch",
     "bench-metadata": "tsx ./tools/bench-metadata.ts",
     "bump-internal-engine-version": "tsx ./tools/auto-bump-engine-version.ts",
-    "codebook-network-csp": "tsx ./tools/generate_compression_codebooks.ts -- network-csp",
-    "codebook-network-redirect": "tsx ./tools/generate_compression_codebooks.ts -- network-redirect",
-    "codebook-network-filter": "tsx ./tools/generate_compression_codebooks.ts -- network-filter",
-    "codebook-network-hostname": "tsx ./tools/generate_compression_codebooks.ts -- network-hostname",
-    "codebook-cosmetic-selector": "tsx ./tools/generate_compression_codebooks.ts -- cosmetic-selector",
-    "codebook-raw-network": "tsx ./tools/generate_compression_codebooks.ts -- raw-network",
-    "codebook-raw-cosmetic": "tsx ./tools/generate_compression_codebooks.ts -- raw-cosmetic",
-    "generate-codebooks": "concurrently -n build: \"yarn:codebook-*\" && yarn bump-internal-engine-version"
+    "codebook-network-csp": "tsx ./tools/generate_compression_codebooks.ts network-csp",
+    "codebook-network-redirect": "tsx ./tools/generate_compression_codebooks.ts network-redirect",
+    "codebook-network-filter": "tsx ./tools/generate_compression_codebooks.ts network-filter",
+    "codebook-network-hostname": "tsx ./tools/generate_compression_codebooks.ts network-hostname",
+    "codebook-cosmetic-selector": "tsx ./tools/generate_compression_codebooks.ts cosmetic-selector",
+    "codebook-raw-network": "tsx ./tools/generate_compression_codebooks.ts raw-network",
+    "codebook-raw-cosmetic": "tsx ./tools/generate_compression_codebooks.ts raw-cosmetic",
+    "generate-codebooks": "tsx ./tools/generate_compression_codebooks_mapper.ts && yarn bump-internal-engine-version"
   },
   "bugs": {
     "url": "https://github.com/ghostery/adblocker/issues"
@@ -105,7 +105,6 @@
     "@types/node": "^22.0.2",
     "axios": "^1.7.2",
     "chai": "^5.1.0",
-    "concurrently": "^9.0.0",
     "eslint": "^9.3.0",
     "mocha": "^10.2.0",
     "nyc": "^17.0.0",
diff --git a/packages/adblocker/tools/generate_compression_codebooks.ts b/packages/adblocker/tools/generate_compression_codebooks.ts
index f8c347820f..02cdce9131 100644
--- a/packages/adblocker/tools/generate_compression_codebooks.ts
+++ b/packages/adblocker/tools/generate_compression_codebooks.ts
@@ -114,7 +114,7 @@ function validateCodebook(codebook: string[], strings: string[]): void {
   });
 }
 
-async function generateCodebook(kind: string): Promise<string[]> {
+async function generateCodebook(kind: string, maxNgram?: number): Promise<string[]> {
   const strings = await getStrings(kind);
   let maxSize = 0;
   for (const string of strings) {
@@ -124,22 +124,31 @@ async function generateCodebook(kind: string): Promise<string[]> {
   }
   console.log(`Generate codebook ${kind} using ${strings.length} strings.`);
   const finetuneNgrams = [1];
-  const options = { finetuneNgrams, maxNgram: maxSize, maxRoundsWithNoImprovements: 10 };
-  if (kind === 'raw-cosmetic') {
-    options.maxNgram = 19;
-  } else if (kind === 'raw-network') {
-    options.maxNgram = 20;
-  } else if (kind === 'cosmetic-selector') {
-    options.maxNgram = 127;
-  }
+  const options = {
+    finetuneNgrams,
+    maxNgram: maxNgram ?? maxSize,
+    maxRoundsWithNoImprovements: 10,
+  };
   const codebook = generate(strings, options);
   validateCodebook(codebook, strings);
   return codebook;
 }
 
 (async () => {
-  const kind = process.argv[process.argv.length - 1];
-  const codebook = await generateCodebook(kind);
+  const [kind, maxNgramLiteral] = process.argv.slice(2);
+  let maxNgram: number | undefined;
+  if (maxNgramLiteral === undefined) {
+    if (kind === 'raw-cosmetic') {
+      maxNgram = 19;
+    } else if (kind === 'raw-network') {
+      maxNgram = 20;
+    } else if (kind === 'cosmetic-selector') {
+      maxNgram = 127;
+    }
+  } else {
+    maxNgram = parseInt(maxNgramLiteral, 10);
+  }
+  const codebook = await generateCodebook(kind, maxNgram);
   const output = resolve(__dirname, `../src/codebooks/${kind}.ts`);
   console.log('Updating', output);
   await fs.writeFile(
diff --git a/packages/adblocker/tools/generate_compression_codebooks_mapper.ts b/packages/adblocker/tools/generate_compression_codebooks_mapper.ts
new file mode 100644
index 0000000000..e999373802
--- /dev/null
+++ b/packages/adblocker/tools/generate_compression_codebooks_mapper.ts
@@ -0,0 +1,170 @@
+import { exec } from 'node:child_process';
+import { readFile, writeFile } from 'node:fs/promises';
+import { cpus } from 'node:os';
+import { promisify } from 'node:util';
+
+const execAsync = promisify(exec);
+
+const KINDS = [
+  'network-csp',
+  'network-redirect',
+  'network-filter',
+  'network-hostname',
+  'cosmetic-selector',
+  'raw-network',
+  'raw-cosmetic',
+] as const;
+type Kind = (typeof KINDS)[number];
+
+const SCRIPT_PATH = './tools/generate_compression_codebooks.ts';
+
+const IS_CI = typeof process.env['CI'] !== 'undefined';
+
+/**
+ * Runs the codebook generator script for `kind` in a child process, passing
+ * `maxNgram` as an extra argument when it is given.
+ *
+ * @returns `true` when the child process failed, `false` when it succeeded.
+ */
+async function runCodebookGeneration(kind: Kind, maxNgram?: number): Promise<boolean> {
+  let cmd = `tsx '${SCRIPT_PATH}' '${kind}'`;
+  if (maxNgram !== undefined) {
+    cmd += ` '${maxNgram}'`;
+  }
+
+  try {
+    await execAsync(cmd);
+    return false;
+  } catch (error) {
+    console.error(
+      `[ERROR] Failed to generate codebook for the kind of "${kind}" with "maxNgram" of "${maxNgram}"`,
+      error,
+    );
+    return true;
+  }
+}
+
+/** Reads the current source of the codebook generator script. */
+async function getScriptContent(): Promise<string> {
+  return readFile(SCRIPT_PATH, 'utf8');
+}
+
+/**
+ * Generates the codebook for `kind`.
+ *
+ * When "CI" is set and the generator script holds a pre-defined "maxNgram" for
+ * `kind`, that value is lowered until generation succeeds and the working value
+ * is written back into the script. Otherwise the generator runs once without an
+ * explicit "maxNgram".
+ *
+ * NOTE(review): kinds are processed concurrently and each may rewrite the
+ * script; consider serializing the write-back below.
+ *
+ * @returns `true` when no codebook could be generated, `false` otherwise.
+ */
+async function tryCodebookGeneration(kind: Kind): Promise<boolean> {
+  const pattern = new RegExp(`if \\(kind === '${kind}'\\) {\n +maxNgram = (\\d+);`);
+  const match = pattern.exec(await getScriptContent());
+  if (IS_CI === false || match === null) {
+    console.warn(
+      `[WARN] Skipping automatic search for maximum "maxNgram" value as looking up pre-defined "maxNgram" value for the kind "${kind}" failed or the environment variable "CI" was not set!`,
+    );
+
+    return runCodebookGeneration(kind);
+  }
+
+  const [fullMatch, maxNgramLiteral] = match;
+  let maxNgramSize = parseInt(maxNgramLiteral, 10);
+
+  for (;;) {
+    // Give up below 1: a failure unrelated to "maxNgram" would otherwise keep
+    // decrementing forever.
+    if (maxNgramSize < 1) {
+      console.error(`[ERROR] No working "maxNgram" was found for the kind "${kind}"!`);
+      return true;
+    }
+
+    console.log(`[INFO] Trying "maxNgram" of "${maxNgramSize}" for the kind "${kind}"...`);
+
+    const crashed = await runCodebookGeneration(kind, maxNgramSize);
+    if (crashed === false) {
+      break;
+    }
+
+    --maxNgramSize;
+  }
+
+  const foundMaxNgramLiteral = maxNgramSize.toString();
+  if (maxNgramLiteral !== foundMaxNgramLiteral) {
+    await writeFile(
+      SCRIPT_PATH,
+      (await getScriptContent()).replace(
+        fullMatch,
+        fullMatch.replace(maxNgramLiteral, foundMaxNgramLiteral),
+      ),
+      'utf8',
+    );
+  }
+
+  return false;
+}
+
+type Task = () => Promise<unknown>;
+
+/** Runs queued tasks with at most `maxConcurrency` of them in flight. */
+class Threads {
+  tasks: Task[] = [];
+  processes: number = 0;
+
+  // `cpus()` may return an empty list, so never go below one worker.
+  constructor(readonly maxConcurrency: number = Math.max(1, cpus().length)) {
+    console.log(`[INFO] Limiting maximum concurrency to "${maxConcurrency}"...`);
+  }
+
+  public enqueue(task: Task) {
+    this.tasks.push(task);
+
+    // Create process which will continue to consume tasks until they run out.
+    if (this.processes >= this.maxConcurrency) {
+      return;
+    }
+    void this.process();
+  }
+
+  private async process() {
+    this.processes++;
+
+    try {
+      while (this.tasks.length > 0) {
+        const task = this.tasks.shift();
+        if (task === undefined) {
+          break;
+        }
+
+        try {
+          await task();
+        } catch (error) {
+          // Keep the worker alive for the remaining tasks but fail the run.
+          console.error('[ERROR] Task failed unexpectedly', error);
+          process.exitCode = 1;
+        }
+      }
+    } finally {
+      this.processes--;
+    }
+  }
+}
+
+void (async function () {
+  const threads = new Threads();
+
+  for (const kind of KINDS) {
+    threads.enqueue(async () => {
+      // Await the generation so the worker stays occupied until it finishes;
+      // otherwise every kind starts at once and the limit has no effect.
+      if (await tryCodebookGeneration(kind)) {
+        process.exitCode = 1;
+      }
+    });
+  }
+})();
diff --git a/yarn.lock b/yarn.lock
index 16a1f0ef44..5c253610e2 100644
--- a/yarn.lock
+++ b/yarn.lock
@@ -948,7 +948,6 @@ __metadata:
     "@types/node": "npm:^22.0.2"
     axios: "npm:^1.7.2"
     chai: "npm:^5.1.0"
-    concurrently: "npm:^9.0.0"
     eslint: "npm:^9.3.0"
     mocha: "npm:^10.2.0"
     nyc: "npm:^17.0.0"
@@ -3790,24 +3789,6 @@ __metadata:
   languageName: node
   linkType: hard
 
-"concurrently@npm:^9.0.0":
-  version: 9.1.0
-  resolution: "concurrently@npm:9.1.0"
-  dependencies:
-    chalk: "npm:^4.1.2"
-    lodash: "npm:^4.17.21"
-    rxjs: "npm:^7.8.1"
-    shell-quote: "npm:^1.8.1"
-    supports-color: "npm:^8.1.1"
-    tree-kill: "npm:^1.2.2"
-    yargs: "npm:^17.7.2"
-  bin:
-    conc: dist/bin/concurrently.js
-    concurrently: dist/bin/concurrently.js
-  checksum: 10/9ed158095a6dcb30b2fbc00f173a993d080b6eca466b8005650b051f82618991ba8ae76b39c41ae78780f83846154e5ec434753f402aa4401acf0ffb2422e1cf
-  languageName: node
-  linkType: hard
-
 "console-control-strings@npm:^1.1.0":
   version: 1.1.0
   resolution: "console-control-strings@npm:1.1.0"
@@ -9243,7 +9224,7 @@ __metadata:
   languageName: node
   linkType: hard
 
-"rxjs@npm:^7.5.5, rxjs@npm:^7.8.1":
+"rxjs@npm:^7.5.5":
   version: 7.8.1
   resolution: "rxjs@npm:7.8.1"
   dependencies:
@@ -9389,13 +9370,6 @@ __metadata:
   languageName: node
   linkType: hard
 
-"shell-quote@npm:^1.8.1":
-  version: 1.8.1
-  resolution: "shell-quote@npm:1.8.1"
-  checksum: 10/af19ab5a1ec30cb4b2f91fd6df49a7442d5c4825a2e269b3712eded10eedd7f9efeaab96d57829880733fc55bcdd8e9b1d8589b4befb06667c731d08145e274d
-  languageName: node
-  linkType: hard
-
 "signal-exit@npm:3.0.7, signal-exit@npm:^3.0.2, signal-exit@npm:^3.0.3":
   version: 3.0.7
   resolution: "signal-exit@npm:3.0.7"
@@ -10082,15 +10056,6 @@ __metadata:
   languageName: node
   linkType: hard
 
-"tree-kill@npm:^1.2.2":
-  version: 1.2.2
-  resolution: "tree-kill@npm:1.2.2"
-  bin:
-    tree-kill: cli.js
-  checksum: 10/49117f5f410d19c84b0464d29afb9642c863bc5ba40fcb9a245d474c6d5cc64d1b177a6e6713129eb346b40aebb9d4631d967517f9fbe8251c35b21b13cd96c7
-  languageName: node
-  linkType: hard
-
 "treeverse@npm:^3.0.0":
   version: 3.0.0
   resolution: "treeverse@npm:3.0.0"