diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS new file mode 100644 index 0000000..819e9c8 --- /dev/null +++ b/.github/CODEOWNERS @@ -0,0 +1 @@ +* @uetchy diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..572c138 --- /dev/null +++ b/.gitignore @@ -0,0 +1,130 @@ +.vscode +package-lock.json +yarn.lock +.envrc +/lib + +# Created by https://www.toptal.com/developers/gitignore/api/node +# Edit at https://www.toptal.com/developers/gitignore?templates=node + +### Node ### +# Logs +logs +*.log +npm-debug.log* +yarn-debug.log* +yarn-error.log* +lerna-debug.log* +.pnpm-debug.log* + +# Diagnostic reports (https://nodejs.org/api/report.html) +report.[0-9]*.[0-9]*.[0-9]*.[0-9]*.json + +# Runtime data +pids +*.pid +*.seed +*.pid.lock + +# Directory for instrumented libs generated by jscoverage/JSCover +lib-cov + +# Coverage directory used by tools like istanbul +coverage +*.lcov + +# nyc test coverage +.nyc_output + +# Grunt intermediate storage (https://gruntjs.com/creating-plugins#storing-task-files) +.grunt + +# Bower dependency directory (https://bower.io/) +bower_components + +# node-waf configuration +.lock-wscript + +# Compiled binary addons (https://nodejs.org/api/addons.html) +build/Release + +# Dependency directories +node_modules/ +jspm_packages/ + +# Snowpack dependency directory (https://snowpack.dev/) +web_modules/ + +# TypeScript cache +*.tsbuildinfo + +# Optional npm cache directory +.npm + +# Optional eslint cache +.eslintcache + +# Microbundle cache +.rpt2_cache/ +.rts2_cache_cjs/ +.rts2_cache_es/ +.rts2_cache_umd/ + +# Optional REPL history +.node_repl_history + +# Output of 'npm pack' +*.tgz + +# Yarn Integrity file +.yarn-integrity + +# dotenv environment variables file +.env +.env.test +.env.production + +# parcel-bundler cache (https://parceljs.org/) +.cache +.parcel-cache + +# Next.js build output +.next +out + +# Nuxt.js build / generate output +.nuxt +dist + +# Gatsby files +.cache/ +# Comment in the public line in if your project uses Gatsby and not Next.js +# https://nextjs.org/blog/next-9-1#public-directory-support +# public + +# vuepress build output +.vuepress/dist + +# Serverless directories +.serverless/ + +# FuseBox cache +.fusebox/ + +# DynamoDB Local files +.dynamodb/ + +# TernJS port file +.tern-port + +# Stores VSCode versions used for testing VSCode extensions +.vscode-test + +# yarn v2 +.yarn/cache +.yarn/unplugged +.yarn/build-state.yml +.yarn/install-state.gz +.pnp.* + +# End of https://www.toptal.com/developers/gitignore/api/node diff --git a/.prettierrc b/.prettierrc new file mode 100644 index 0000000..0967ef4 --- /dev/null +++ b/.prettierrc @@ -0,0 +1 @@ +{} diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 0000000..e69de29 diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..204272b --- /dev/null +++ b/LICENSE @@ -0,0 +1,207 @@ +Apache License +Version 2.0, January 2004 +http://www.apache.org/licenses/ + +TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + +1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, and + distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by the + copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all other + entities that control, are controlled by, or are under common control with + that entity. For the purposes of this definition, "control" means (i) the + power, direct or indirect, to cause the direction or management of such + entity, whether by contract or otherwise, or (ii) ownership of + fifty percent (50%) or more of the outstanding shares, or (iii) beneficial + ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity exercising + permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation source, + and configuration files. + + "Object" form shall mean any form resulting from mechanical transformation + or translation of a Source form, including but not limited to compiled + object code, generated documentation, and conversions to + other media types. + + "Work" shall mean the work of authorship, whether in Source or Object + form, made available under the License, as indicated by a copyright notice + that is included in or attached to the work (an example is provided in the + Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object form, + that is based on (or derived from) the Work and for which the editorial + revisions, annotations, elaborations, or other modifications represent, + as a whole, an original work of authorship. For the purposes of this + License, Derivative Works shall not include works that remain separable + from, or merely link (or bind by name) to the interfaces of, the Work and + Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including the original + version of the Work and any modifications or additions to that Work or + Derivative Works thereof, that is intentionally submitted to Licensor for + inclusion in the Work by the copyright owner or by an individual or + Legal Entity authorized to submit on behalf of the copyright owner. + For the purposes of this definition, "submitted" means any form of + electronic, verbal, or written communication sent to the Licensor or its + representatives, including but not limited to communication on electronic + mailing lists, source code control systems, and issue tracking systems + that are managed by, or on behalf of, the Licensor for the purpose of + discussing and improving the Work, but excluding communication that is + conspicuously marked or otherwise designated in writing by the copyright + owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity on + behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + +2. Grant of Copyright License. + + Subject to the terms and conditions of this License, each Contributor + hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, + royalty-free, irrevocable copyright license to reproduce, prepare + Derivative Works of, publicly display, publicly perform, sublicense, + and distribute the Work and such Derivative Works in + Source or Object form. + +3. Grant of Patent License. + + Subject to the terms and conditions of this License, each Contributor + hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, + royalty-free, irrevocable (except as stated in this section) patent + license to make, have made, use, offer to sell, sell, import, and + otherwise transfer the Work, where such license applies only to those + patent claims licensable by such Contributor that are necessarily + infringed by their Contribution(s) alone or by combination of their + Contribution(s) with the Work to which such Contribution(s) was submitted. + If You institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work or a + Contribution incorporated within the Work constitutes direct or + contributory patent infringement, then any patent licenses granted to + You under this License for that Work shall terminate as of the date such + litigation is filed. + +4. Redistribution. + + You may reproduce and distribute copies of the Work or Derivative Works + thereof in any medium, with or without modifications, and in Source or + Object form, provided that You meet the following conditions: + + 1. You must give any other recipients of the Work or Derivative Works a + copy of this License; and + + 2. You must cause any modified files to carry prominent notices stating + that You changed the files; and + + 3. You must retain, in the Source form of any Derivative Works that You + distribute, all copyright, patent, trademark, and attribution notices from + the Source form of the Work, excluding those notices that do not pertain + to any part of the Derivative Works; and + + 4. If the Work includes a "NOTICE" text file as part of its distribution, + then any Derivative Works that You distribute must include a readable copy + of the attribution notices contained within such NOTICE file, excluding + those notices that do not pertain to any part of the Derivative Works, + in at least one of the following places: within a NOTICE text file + distributed as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, within a + display generated by the Derivative Works, if and wherever such + third-party notices normally appear. The contents of the NOTICE file are + for informational purposes only and do not modify the License. + You may add Your own attribution notices within Derivative Works that You + distribute, alongside or as an addendum to the NOTICE text from the Work, + provided that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and may + provide additional or different license terms and conditions for use, + reproduction, or distribution of Your modifications, or for any such + Derivative Works as a whole, provided Your use, reproduction, and + distribution of the Work otherwise complies with the conditions + stated in this License. + +5. Submission of Contributions. + + Unless You explicitly state otherwise, any Contribution intentionally + submitted for inclusion in the Work by You to the Licensor shall be under + the terms and conditions of this License, without any additional + terms or conditions. Notwithstanding the above, nothing herein shall + supersede or modify the terms of any separate license agreement you may + have executed with Licensor regarding such Contributions. + +6. Trademarks. + + This License does not grant permission to use the trade names, trademarks, + service marks, or product names of the Licensor, except as required for + reasonable and customary use in describing the origin of the Work and + reproducing the content of the NOTICE file. + +7. Disclaimer of Warranty. + + Unless required by applicable law or agreed to in writing, Licensor + provides the Work (and each Contributor provides its Contributions) + on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + either express or implied, including, without limitation, any warranties + or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS + FOR A PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any risks + associated with Your exercise of permissions under this License. + +8. Limitation of Liability. + + In no event and under no legal theory, whether in tort + (including negligence), contract, or otherwise, unless required by + applicable law (such as deliberate and grossly negligent acts) or agreed + to in writing, shall any Contributor be liable to You for damages, + including any direct, indirect, special, incidental, or consequential + damages of any character arising as a result of this License or out of + the use or inability to use the Work (including but not limited to damages + for loss of goodwill, work stoppage, computer failure or malfunction, + or any and all other commercial damages or losses), even if such + Contributor has been advised of the possibility of such damages. + +9. Accepting Warranty or Additional Liability. + + While redistributing the Work or Derivative Works thereof, You may choose + to offer, and charge a fee for, acceptance of support, warranty, + indemnity, or other liability obligations and/or rights consistent with + this License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf of any + other Contributor, and only if You agree to indemnify, defend, and hold + each Contributor harmless for any liability incurred by, or claims + asserted against, such Contributor by reason of your accepting any such + warranty or additional liability. + +END OF TERMS AND CONDITIONS + +APPENDIX: How to apply the Apache License to your work + + To apply the Apache License to your work, attach the following boilerplate + notice, with the fields enclosed by brackets "[]" replaced with your own + identifying information. (Don't include the brackets!) The text should be + enclosed in the appropriate comment syntax for the file format. We also + recommend that a file or class name and description of purpose be included + on the same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright 2021 Yasuaki Uechi + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + or implied. See the License for the specific language governing + permissions and limitations under the License. diff --git a/README.md b/README.md new file mode 100644 index 0000000..fdf4ef5 --- /dev/null +++ b/README.md @@ -0,0 +1,45 @@ +# Spamreaper + +PoC of BWT-RLE based sentence complexity score. Specifically made for handling multiple chats in live streams. + +## Use + +```bash +npm install spamreaper +``` + +```js +import { isSpam } from "spamreaper"; + +const safe = [ + "oh", + "nice!", + "lol", + "looool", + "kusa", + "lol", + "lol", + "lol", + "lol", + "lol", +]; + +const spam = [ + "kapan nyanyi lagi?", + "kapan nyanyi lagi?", + "kapan nyanyi lagi?", + "kapan nyanyi lagi?", + "kapan nyanyi lagi?", + "kapan nyanyi lagi?", + "kapan nyanyi lagi?", + "kapan nyanyi lagi?", + "kapan nyanyi lagi?", +]; + +isSpam(safe); // => false +isSpam(spam); // => true +``` + +## Roadmap + +- [ ] Test with a more diverse set of examples. diff --git a/jest.config.js b/jest.config.js new file mode 100644 index 0000000..a93551f --- /dev/null +++ b/jest.config.js @@ -0,0 +1,4 @@ +module.exports = { + preset: "ts-jest", + testEnvironment: "node", +}; diff --git a/package.json b/package.json new file mode 100644 index 0000000..f13c4da --- /dev/null +++ b/package.json @@ -0,0 +1,47 @@ +{ + "name": "spamreaper", + "description": "PoC of BWT-RLE based sentence complexity score. Specifically made for handling multiple chats in live streams.", + "version": "0.0.0", + "author": "Yasuaki Uechi (https://uechi.io/)", + "scripts": { + "build": "run-p tsup:*", + "clean": "shx rm -rf lib", + "dev": "yarn tsup:cjs --watch", + "prepublishOnly": "npm run clean && npm run build", + "test": "jest", + "tsup:cjs": "tsup src/index.ts -d lib/cjs --format cjs --dts --minify", + "tsup:esm": "tsup src/index.ts -d lib/esm --format esm --minify" + }, + "types": "lib/cjs/index.d.ts", + "module": "lib/esm/index.js", + "main": "lib/cjs/index.js", + "files": [ + "lib" + ], + "devDependencies": { + "@types/jest": "^26.0.24", + "@types/node": "^16.0.1", + "jest": "^27.0.6", + "npm-run-all": "^4.1.5", + "shx": "^0.3.3", + "ts-jest": "^27.0.3", + "tsup": "^4.12.5", + "typescript": "^4.3.5" + }, + "homepage": "https://github.com/holodata/spamreaper", + "repository": { + "type": "git", + "url": "https://github.com/holodata/spamreaper.git" + }, + "bugs": { + "url": "https://github.com/holodata/spamreaper/issues" + }, + "license": "Apache-2.0", + "keywords": [ + "antispam", + "spam" + ], + "engines": { + "node": ">= 12.18.3" + } +} diff --git a/src/index.ts b/src/index.ts new file mode 100644 index 0000000..b23f554 --- /dev/null +++ b/src/index.ts @@ -0,0 +1,48 @@ +function bonafideMod(i: number, j: number) { + return ((i % j) + j) % j; +} + +function suffixArray(input: string): number[] { + const charArr = Array.from(input).map((v, i) => [v, i]) as [string, number][]; + charArr.sort((a, b) => (a[0] > b[0] ? 1 : -1)); + return charArr.map((r) => r[1]); +} + +function bwt(input: string) { + const ref = suffixArray(input); + return ref.map((x) => input[bonafideMod(x - 1, input.length)]).join(""); +} + +// Popped out from GitHub Copilot (wow) +function rle(input: string) { + const compressed = []; + let count = 1; + for (let i = 1; i < input.length; i++) { + if (input[i] === input[i - 1]) { + count++; + } else { + compressed.push(input[i - 1] + count.toString()); + count = 1; + } + } + compressed.push(input[input.length - 1] + count.toString()); + return compressed.join(""); +} + +// the bigger the number, the more likely it is to be spam. +// judging it around 0.5 will give you a reasonably good result. +export function sentenceComplexityScore(input: string) { + const bwt2 = bwt(bwt(input)); + const rleStr = rle(bwt2); + const score = rleStr.length / 2; + return -1 * Math.log(score / input.length); +} + +export function concatinatedScore(inputs: string[]) { + return sentenceComplexityScore(inputs.join("")); +} + +export function isSpam(inputs: string[], threshold: number = 0.65) { + const score = concatinatedScore(inputs); + return score > threshold; +} diff --git a/tests/index.test.ts b/tests/index.test.ts new file mode 100644 index 0000000..5b13a34 --- /dev/null +++ b/tests/index.test.ts @@ -0,0 +1,54 @@ +import { isSpam } from ".."; + +it("spam", async () => { + const safe1 = [ + "ina living the dream", + "not only is she drawing cute anime girls", + "she personally knows them", + "and is getting paid to do it", + ]; + const safe2 = [ + "oh", + "nice!", + "lol", + "looool", + "kusa", + "lol", + "lol", + "lol", + "lol", + "lol", + ]; + const safe3 = ["ina living the dream"]; + const spam1 = [ + "kapan nyanyi lagi?", + "kapan nyanyi lagi?", + "kapan nyanyi lagi?", + "kapan nyanyi lagi?", + "kapan nyanyi lagi?", + "kapan nyanyi lagi?", + "kapan nyanyi lagi?", + "kapan nyanyi lagi?", + "kapan nyanyi lagi?", + ]; + const spam2 = [ + "くそくそくそくそくそくそくそくそくそくそ", + "くそくそくそくそくそくそくそくそくそくそ", + "くそくそくそくそくそくそくそくそくそくそ", + "くそくそくそくそくそくそくそくそくそくそ", + "くそくそくそくそくそくそくそくそくそくそ", + "くそくそくそくそくそくそくそくそくそくそ", + "くそくそくそくそくそくそくそくそくそくそ", + "くそくそくそくそくそくそくそくそくそくそ", + "くそくそくそくそくそくそくそくそくそくそ", + "くそくそくそくそくそくそくそくそくそくそ", + ]; + const spam3 = ["konpeko konpeko konpeko", "konpeko", "konpeko konpeko"]; + + expect(isSpam(safe1)).toBe(false); + expect(isSpam(safe2)).toBe(false); + expect(isSpam(safe3)).toBe(false); + expect(isSpam(spam1)).toBe(true); + expect(isSpam(spam2)).toBe(true); + expect(isSpam(spam3)).toBe(true); +}); diff --git a/tsconfig.json b/tsconfig.json new file mode 100644 index 0000000..3870c3b --- /dev/null +++ b/tsconfig.json @@ -0,0 +1,73 @@ +{ + "exclude": ["tests", "lib"], + "compilerOptions": { + /* Visit https://aka.ms/tsconfig.json to read more about this file */ + + /* Basic Options */ + // "incremental": true, /* Enable incremental compilation */ + "target": "es2019", /* Specify ECMAScript target version: 'ES3' (default), 'ES5', 'ES2015', 'ES2016', 'ES2017', 'ES2018', 'ES2019', 'ES2020', 'ES2021', or 'ESNEXT'. */ + "module": "commonjs", /* Specify module code generation: 'none', 'commonjs', 'amd', 'system', 'umd', 'es2015', 'es2020', or 'ESNext'. */ + // "lib": [], /* Specify library files to be included in the compilation. */ + // "allowJs": true, /* Allow javascript files to be compiled. */ + // "checkJs": true, /* Report errors in .js files. */ + // "jsx": "preserve", /* Specify JSX code generation: 'preserve', 'react-native', 'react', 'react-jsx' or 'react-jsxdev'. */ + "declaration": true, /* Generates corresponding '.d.ts' file. */ + // "declarationMap": true, /* Generates a sourcemap for each corresponding '.d.ts' file. */ + // "sourceMap": true, /* Generates corresponding '.map' file. */ + // "outFile": "./", /* Concatenate and emit output to single file. */ + "outDir": "./lib", /* Redirect output structure to the directory. */ + "rootDir": "./src", /* Specify the root directory of input files. Use to control the output directory structure with --outDir. */ + // "composite": true, /* Enable project compilation */ + // "tsBuildInfoFile": "./", /* Specify file to store incremental compilation information */ + // "removeComments": true, /* Do not emit comments to output. */ + // "noEmit": true, /* Do not emit outputs. */ + // "importHelpers": true, /* Import emit helpers from 'tslib'. */ + // "downlevelIteration": true, /* Provide full support for iterables in 'for-of', spread, and destructuring when targeting 'ES5' or 'ES3'. */ + // "isolatedModules": true, /* Transpile each file as a separate module (similar to 'ts.transpileModule'). */ + + /* Strict Type-Checking Options */ + "strict": true, /* Enable all strict type-checking options. */ + // "noImplicitAny": true, /* Raise error on expressions and declarations with an implied 'any' type. */ + // "strictNullChecks": true, /* Enable strict null checks. */ + // "strictFunctionTypes": true, /* Enable strict checking of function types. */ + // "strictBindCallApply": true, /* Enable strict 'bind', 'call', and 'apply' methods on functions. */ + // "strictPropertyInitialization": true, /* Enable strict checking of property initialization in classes. */ + // "noImplicitThis": true, /* Raise error on 'this' expressions with an implied 'any' type. */ + // "alwaysStrict": true, /* Parse in strict mode and emit "use strict" for each source file. */ + + /* Additional Checks */ + // "noUnusedLocals": true, /* Report errors on unused locals. */ + // "noUnusedParameters": true, /* Report errors on unused parameters. */ + // "noImplicitReturns": true, /* Report error when not all code paths in function return a value. */ + // "noFallthroughCasesInSwitch": true, /* Report errors for fallthrough cases in switch statement. */ + // "noUncheckedIndexedAccess": true, /* Include 'undefined' in index signature results */ + // "noImplicitOverride": true, /* Ensure overriding members in derived classes are marked with an 'override' modifier. */ + // "noPropertyAccessFromIndexSignature": true, /* Require undeclared properties from index signatures to use element accesses. */ + + /* Module Resolution Options */ + // "moduleResolution": "node", /* Specify module resolution strategy: 'node' (Node.js) or 'classic' (TypeScript pre-1.6). */ + // "baseUrl": "./", /* Base directory to resolve non-absolute module names. */ + // "paths": {}, /* A series of entries which re-map imports to lookup locations relative to the 'baseUrl'. */ + // "rootDirs": [], /* List of root folders whose combined content represents the structure of the project at runtime. */ + // "typeRoots": [], /* List of folders to include type definitions from. */ + // "types": [], /* Type declaration files to be included in compilation. */ + // "allowSyntheticDefaultImports": true, /* Allow default imports from modules with no default export. This does not affect code emit, just typechecking. */ + "esModuleInterop": true, /* Enables emit interoperability between CommonJS and ES Modules via creation of namespace objects for all imports. Implies 'allowSyntheticDefaultImports'. */ + // "preserveSymlinks": true, /* Do not resolve the real path of symlinks. */ + // "allowUmdGlobalAccess": true, /* Allow accessing UMD globals from modules. */ + + /* Source Map Options */ + // "sourceRoot": "", /* Specify the location where debugger should locate TypeScript files instead of source locations. */ + // "mapRoot": "", /* Specify the location where debugger should locate map files instead of generated locations. */ + // "inlineSourceMap": true, /* Emit a single file with source maps instead of having a separate file. */ + // "inlineSources": true, /* Emit the source alongside the sourcemaps within a single file; requires '--inlineSourceMap' or '--sourceMap' to be set. */ + + /* Experimental Options */ + // "experimentalDecorators": true, /* Enables experimental support for ES7 decorators. */ + // "emitDecoratorMetadata": true, /* Enables experimental support for emitting type metadata for decorators. */ + + /* Advanced Options */ + "skipLibCheck": true, /* Skip type checking of declaration files. */ + "forceConsistentCasingInFileNames": true /* Disallow inconsistently-cased references to the same file. */ + } +}