From 2dcd4953fe862a03ec29572c0a90bf3f91ad6f5c Mon Sep 17 00:00:00 2001 From: cuixiaorui Date: Sun, 21 Jul 2024 11:43:05 +0800 Subject: [PATCH] fix: parse xingrong pdf data --- packages/xingrong-courses/package.json | 11 ++++-- .../xingrong-courses/src/parsePDF/index.ts | 4 +- .../xingrong-courses/src/parsePDF/inquire.ts | 5 ++- .../src/parsePDF/parser.test.ts | 38 +++++++++++++++++++ .../xingrong-courses/src/parsePDF/parser.ts | 3 +- pnpm-lock.yaml | 6 +++ 6 files changed, 60 insertions(+), 7 deletions(-) diff --git a/packages/xingrong-courses/package.json b/packages/xingrong-courses/package.json index b84fe6082..5c968946c 100644 --- a/packages/xingrong-courses/package.json +++ b/packages/xingrong-courses/package.json @@ -6,16 +6,19 @@ "scripts": { "upload": "tsx src/seed.ts", "course:add": "tsx src/addCourse.ts", - "pdf:parse": "tsx src/parsePDF/index.ts" + "pdf:parse": "tsx src/parsePDF/index.ts", + "test": "vitest" }, "dependencies": { + "@earthworm/db": "workspace:^", "@earthworm/schema": "workspace:^", + "drizzle-orm": "^0.32.0", "inquirer": "^9.2.13", - "pdf-parse": "^1.1.1", - "drizzle-orm": "^0.32.0" + "pdf-parse": "^1.1.1" }, "devDependencies": { "@types/inquirer": "^9.0.7", - "@types/pdf-parse": "^1.1.4" + "@types/pdf-parse": "^1.1.4", + "vitest": "^2.0.3" } } diff --git a/packages/xingrong-courses/src/parsePDF/index.ts b/packages/xingrong-courses/src/parsePDF/index.ts index 8783e6a87..a9254cdff 100644 --- a/packages/xingrong-courses/src/parsePDF/index.ts +++ b/packages/xingrong-courses/src/parsePDF/index.ts @@ -29,7 +29,9 @@ function save(content: string, fileName: string) { function createFileNameMap(): Record { const fileNameMap: Record = {}; - const files = fs.readdirSync(targetPath); + let files = fs.readdirSync(targetPath); + // 筛选出以 .pdf 结尾的文件 + files = files.filter((file) => file.endsWith(".pdf")); files.sort((a, b) => parseFloat(a) - parseFloat(b)); files.forEach((file, index) => { diff --git a/packages/xingrong-courses/src/parsePDF/inquire.ts b/packages/xingrong-courses/src/parsePDF/inquire.ts index 576816dc2..fadcfeaac 100644 --- a/packages/xingrong-courses/src/parsePDF/inquire.ts +++ b/packages/xingrong-courses/src/parsePDF/inquire.ts @@ -39,7 +39,10 @@ export async function inquire(folderPath: string): Promise { } function listAllFiles(folderPath) { - const files = fs.readdirSync(folderPath); + let files = fs.readdirSync(folderPath); + // 筛选出以 .pdf 结尾的文件 + files = files.filter((file) => file.endsWith(".pdf")); + files.sort((a, b) => { return parseFloat(a) - parseFloat(b); }); diff --git a/packages/xingrong-courses/src/parsePDF/parser.test.ts b/packages/xingrong-courses/src/parsePDF/parser.test.ts index 734e961e1..e2717d9eb 100644 --- a/packages/xingrong-courses/src/parsePDF/parser.test.ts +++ b/packages/xingrong-courses/src/parsePDF/parser.test.ts @@ -92,4 +92,42 @@ describe("pdf parser ", () => { ] `); }); + + it("中文部分是括号开始的", () => { + const pdfText = "我 \n" + "I /aɪ/ \n" + "(过去)它;这件事情 \n" + "it /it/ \n"; + + expect(parse(pdfText)).toMatchInlineSnapshot(` + [ + { + "chinese": "我", + "english": "I", + "soundmark": "/aɪ/", + }, + { + "chinese": "(过去)它;这件事情", + "english": "it", + "soundmark": "/it/", + }, + ] + `); + }); + + it("中文部分是英文开始的", () => { + const pdfText = "我 \n" + "I /aɪ/ \n" + "be(ed形式) \n" + "been /bɪn/ \n"; + + expect(parse(pdfText)).toMatchInlineSnapshot(` + [ + { + "chinese": "我", + "english": "I", + "soundmark": "/aɪ/", + }, + { + "chinese": "be(ed形式)", + "english": "been", + "soundmark": "/bɪn/", + }, + ] + `); + }); }); diff --git a/packages/xingrong-courses/src/parsePDF/parser.ts b/packages/xingrong-courses/src/parsePDF/parser.ts index 9131e75e9..83a562b0a 100644 --- a/packages/xingrong-courses/src/parsePDF/parser.ts +++ b/packages/xingrong-courses/src/parsePDF/parser.ts @@ -70,7 +70,8 @@ export function parse(text: string) { } function isChinese(str: string) { - const reg = /^[\u4e00-\u9fa5]/; + // 只要里面有中文的话 那么就作为 chinese 部分 + const reg = /[\u4e00-\u9fa5]/; return reg.test(str); } diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 2a9f2e0c3..5b54c3e76 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -364,6 +364,9 @@ importers: packages/xingrong-courses: dependencies: + '@earthworm/db': + specifier: workspace:^ + version: link:../db '@earthworm/schema': specifier: workspace:^ version: link:../schema @@ -383,6 +386,9 @@ importers: '@types/pdf-parse': specifier: ^1.1.4 version: 1.1.4 + vitest: + specifier: ^2.0.3 + version: 2.0.3(@types/node@20.14.9)(happy-dom@13.10.1)(terser@5.31.1) packages: