Skip to content

Commit

Permalink
fix: parse xingrong pdf data
Browse files Browse the repository at this point in the history
  • Loading branch information
cuixiaorui committed Jul 21, 2024
1 parent e5eef48 commit 2dcd495
Show file tree
Hide file tree
Showing 6 changed files with 60 additions and 7 deletions.
11 changes: 7 additions & 4 deletions packages/xingrong-courses/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -6,16 +6,19 @@
"scripts": {
"upload": "tsx src/seed.ts",
"course:add": "tsx src/addCourse.ts",
"pdf:parse": "tsx src/parsePDF/index.ts"
"pdf:parse": "tsx src/parsePDF/index.ts",
"test": "vitest"
},
"dependencies": {
"@earthworm/db": "workspace:^",
"@earthworm/schema": "workspace:^",
"drizzle-orm": "^0.32.0",
"inquirer": "^9.2.13",
"pdf-parse": "^1.1.1",
"drizzle-orm": "^0.32.0"
"pdf-parse": "^1.1.1"
},
"devDependencies": {
"@types/inquirer": "^9.0.7",
"@types/pdf-parse": "^1.1.4"
"@types/pdf-parse": "^1.1.4",
"vitest": "^2.0.3"
}
}
4 changes: 3 additions & 1 deletion packages/xingrong-courses/src/parsePDF/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,9 @@ function save(content: string, fileName: string) {

function createFileNameMap(): Record<string, string> {
const fileNameMap: Record<string, string> = {};
const files = fs.readdirSync(targetPath);
let files = fs.readdirSync(targetPath);
// 筛选出以 .pdf 结尾的文件
files = files.filter((file) => file.endsWith(".pdf"));
files.sort((a, b) => parseFloat(a) - parseFloat(b));

files.forEach((file, index) => {
Expand Down
5 changes: 4 additions & 1 deletion packages/xingrong-courses/src/parsePDF/inquire.ts
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,10 @@ export async function inquire(folderPath: string): Promise<string[]> {
}

function listAllFiles(folderPath) {
const files = fs.readdirSync(folderPath);
let files = fs.readdirSync(folderPath);
// 筛选出以 .pdf 结尾的文件
files = files.filter((file) => file.endsWith(".pdf"));

files.sort((a, b) => {
return parseFloat(a) - parseFloat(b);
});
Expand Down
38 changes: 38 additions & 0 deletions packages/xingrong-courses/src/parsePDF/parser.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -92,4 +92,42 @@ describe("pdf parser ", () => {
]
`);
});

// Case: the Chinese line begins with a full-width parenthesis instead of a
// Chinese character. (Test title in English: "the Chinese part starts with a
// bracket".) The expected snapshot keeps the whole line, including the
// parenthesised prefix, as the `chinese` field of the second entry.
it("中文部分是括号开始的", () => {
// Four lines of input: Chinese line, then "english /soundmark/" line, twice.
const pdfText = "我 \n" + "I /aɪ/ \n" + "(过去)它;这件事情 \n" + "it /it/ \n";

expect(parse(pdfText)).toMatchInlineSnapshot(`
[
{
"chinese": "我",
"english": "I",
"soundmark": "/aɪ/",
},
{
"chinese": "(过去)它;这件事情",
"english": "it",
"soundmark": "/it/",
},
]
`);
});

// Case: the Chinese line begins with Latin letters ("be") followed by a
// parenthesised Chinese note. (Test title in English: "the Chinese part starts
// with English".) The expected snapshot still assigns the whole line to the
// `chinese` field of the second entry.
it("中文部分是英文开始的", () => {
// Four lines of input: Chinese line, then "english /soundmark/" line, twice.
const pdfText = "我 \n" + "I /aɪ/ \n" + "be(ed形式) \n" + "been /bɪn/ \n";

expect(parse(pdfText)).toMatchInlineSnapshot(`
[
{
"chinese": "我",
"english": "I",
"soundmark": "/aɪ/",
},
{
"chinese": "be(ed形式)",
"english": "been",
"soundmark": "/bɪn/",
},
]
`);
});
});
3 changes: 2 additions & 1 deletion packages/xingrong-courses/src/parsePDF/parser.ts
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,8 @@ export function parse(text: string) {
}

/**
 * Tests `str` against a regular expression over the range \u4e00-\u9fa5 and
 * returns the boolean result of that test.
 *
 * NOTE(review): two `const reg` declarations are visible in this block. They
 * look like the before/after lines of a diff hunk (anchored vs. unanchored
 * pattern); only one can exist in the real file — confirm which one is live.
 */
function isChinese(str: string) {
// Anchored pattern: matches only when the first character is in \u4e00-\u9fa5.
const reg = /^[\u4e00-\u9fa5]/;
// As long as the string contains any Chinese character, treat it as the `chinese` part.
const reg = /[\u4e00-\u9fa5]/;
return reg.test(str);
}

Expand Down
6 changes: 6 additions & 0 deletions pnpm-lock.yaml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

0 comments on commit 2dcd495

Please sign in to comment.