From 771b8ad125561d8261e655b5c7fae6bab7974475 Mon Sep 17 00:00:00 2001 From: wenqh Date: Fri, 4 Nov 2022 11:48:47 +0800 Subject: [PATCH] 1.0 --- .gitignore | 4 + .idea/.gitignore | 8 ++ .idea/artifacts/jdread_downloader_jar.xml | 73 +++++++++++++++++++ .idea/compiler.xml | 13 ++++ .idea/encodings.xml | 7 ++ .idea/jarRepositories.xml | 20 +++++ .idea/misc.xml | 14 ++++ .idea/vcs.xml | 6 ++ pom.xml | 25 +++++++ src/main/java/io/github/lovelyjuice/Main.java | 72 ++++++++++++++++++ .../io/github/lovelyjuice/ReaderPage.java | 27 +++++++ src/main/resources/META-INF/MANIFEST.MF | 3 + 12 files changed, 272 insertions(+) create mode 100644 .gitignore create mode 100644 .idea/.gitignore create mode 100644 .idea/artifacts/jdread_downloader_jar.xml create mode 100644 .idea/compiler.xml create mode 100644 .idea/encodings.xml create mode 100644 .idea/jarRepositories.xml create mode 100644 .idea/misc.xml create mode 100644 .idea/vcs.xml create mode 100644 pom.xml create mode 100644 src/main/java/io/github/lovelyjuice/Main.java create mode 100644 src/main/java/io/github/lovelyjuice/ReaderPage.java create mode 100644 src/main/resources/META-INF/MANIFEST.MF diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..2be1c43 --- /dev/null +++ b/.gitignore @@ -0,0 +1,4 @@ +/*.html +/*.exe +/out/ +/target/ diff --git a/.idea/.gitignore b/.idea/.gitignore new file mode 100644 index 0000000..13566b8 --- /dev/null +++ b/.idea/.gitignore @@ -0,0 +1,8 @@ +# Default ignored files +/shelf/ +/workspace.xml +# Editor-based HTTP Client requests +/httpRequests/ +# Datasource local storage ignored files +/dataSources/ +/dataSources.local.xml diff --git a/.idea/artifacts/jdread_downloader_jar.xml b/.idea/artifacts/jdread_downloader_jar.xml new file mode 100644 index 0000000..a407129 --- /dev/null +++ b/.idea/artifacts/jdread_downloader_jar.xml @@ -0,0 +1,73 @@ + + + $PROJECT_DIR$/out/artifacts/jdread_downloader_jar + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/compiler.xml b/.idea/compiler.xml new file mode 100644 index 0000000..d466afe --- /dev/null +++ b/.idea/compiler.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/encodings.xml b/.idea/encodings.xml new file mode 100644 index 0000000..aa00ffa --- /dev/null +++ b/.idea/encodings.xml @@ -0,0 +1,7 @@ + + + + + + + \ No newline at end of file diff --git a/.idea/jarRepositories.xml b/.idea/jarRepositories.xml new file mode 100644 index 0000000..712ab9d --- /dev/null +++ b/.idea/jarRepositories.xml @@ -0,0 +1,20 @@ + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/misc.xml b/.idea/misc.xml new file mode 100644 index 0000000..accd629 --- /dev/null +++ b/.idea/misc.xml @@ -0,0 +1,14 @@ + + + + + + + + + + \ No newline at end of file diff --git a/.idea/vcs.xml b/.idea/vcs.xml new file mode 100644 index 0000000..94a25f7 --- /dev/null +++ b/.idea/vcs.xml @@ -0,0 +1,6 @@ + + + + + + \ No newline at end of file diff --git a/pom.xml b/pom.xml new file mode 100644 index 0000000..dd59f22 --- /dev/null +++ b/pom.xml @@ -0,0 +1,25 @@ + + + 4.0.0 + + io.github.lovelyjuice + jdread-downloader + 1.0-SNAPSHOT + + + 11 + 11 + UTF-8 + + + + + org.seleniumhq.selenium + selenium-java + 4.4.0 + + + + \ No newline at end of file diff --git a/src/main/java/io/github/lovelyjuice/Main.java b/src/main/java/io/github/lovelyjuice/Main.java new file mode 100644 index 0000000..9139586 --- /dev/null +++ b/src/main/java/io/github/lovelyjuice/Main.java @@ -0,0 +1,72 @@ +package io.github.lovelyjuice; + +import org.openqa.selenium.By; +import org.openqa.selenium.NoSuchElementException; +import org.openqa.selenium.chrome.ChromeDriver; +import org.openqa.selenium.support.ui.WebDriverWait; + +import java.io.FileNotFoundException; +import java.io.PrintWriter; +import java.time.Duration; +import java.util.LinkedHashSet; +import 
java.util.Scanner;
+import java.util.stream.Collectors;
+
+/**
+ * Console entry point: opens Chrome on the JD login page, then, every time the user enters a
+ * token other than {@code q}, walks the book currently open in the reader chapter by chapter and
+ * saves the collected markup as {@code <book name>.html} in the working directory.
+ */
+public class Main {
+    /** Maximum number of characters of each chapter echoed to the console as progress output. */
+    private static final int PREVIEW_LENGTH = 50;
+
+    /**
+     * Starts the browser and runs the interactive download loop until {@code q} is entered or
+     * standard input ends. The browser is closed on every exit path.
+     *
+     * @param args unused
+     * @throws FileNotFoundException kept for source compatibility; failures while downloading a
+     *                               book are reported on the console and the loop continues
+     */
+    public static void main(String[] args) throws FileNotFoundException {
+        System.setProperty("webdriver.chrome.driver", "chromedriver.exe");
+        ChromeDriver driver = new ChromeDriver();
+        try {
+            driver.get("https://passport.jd.com/new/login.aspx?ReturnUrl=https%3A%2F%2Febooks.jd.com%2Fbookshelf");
+            System.out.println("操作浏览器登录并跳转到书籍第一页后,输入任意字符开始,输入q退出:");
+            // One Scanner for the whole session: a fresh Scanner per iteration may buffer and
+            // then discard input that was meant for a later prompt.
+            Scanner input = new Scanner(System.in);
+            // hasNext() ends the loop cleanly when stdin is closed instead of throwing.
+            while (input.hasNext() && !input.next().equals("q")) {
+                try {
+                    downloadBook(driver);
+                } catch (Exception e) {
+                    System.out.println(e);
+                    System.out.println("出错了,请重试!");
+                }
+                System.out.println("跳转到书籍第一页后,输入任意字符开始,输入q退出:");
+            }
+        } finally {
+            driver.quit();
+        }
+    }
+
+    /**
+     * Collects every chapter of the book currently shown in the reader and writes it to disk.
+     *
+     * @param driver browser session already positioned on the first page of a book
+     * @throws java.io.IOException if the output file cannot be created or written
+     */
+    private static void downloadBook(ChromeDriver driver) throws java.io.IOException {
+        // NOTE(review): this literal and the ones passed to append(""), staticStylesheet and
+        // writer.write("" + ... + "") below are empty in this copy of the patch; they may have
+        // held HTML markup that was stripped in transit - confirm against the repository.
+        var contentBuilder = new StringBuilder("");
+        var styleSet = new LinkedHashSet<String>();
+        var readerPage = new ReaderPage(driver);
+        var bookName = readerPage.bookName.getAttribute("innerText");
+        System.out.printf("开始爬取《%s》%n", bookName);
+        int debugTimes = 0;
+        String debugFlag = System.getenv("debug");
+        while (true) {
+            // When the "debug" environment variable is set, stop after the first 7 chapters.
+            if (debugTimes++ > 6 && debugFlag != null) break;
+            String contentHtml = readerPage.content.getAttribute("outerHTML");
+            System.out.println(chapterPreview(contentHtml));
+            contentBuilder.append(contentHtml);
+            // Some books use different CSS in different chapters, so merge all of them.
+            styleSet.addAll(readerPage.styleSheetList.stream()
+                    .map(webElement -> webElement.getAttribute("outerHTML"))
+                    .collect(Collectors.toList()));
+            System.out.println("-----------------------");
+            try {
+                readerPage.nextChapterMiddleButton.click();
+            } catch (NoSuchElementException ignored) {
+                break; // No "next chapter" button: the last chapter has been reached.
+            }
+            // The "next chapter" button is rendered before the chapter text, so once the button
+            // is rendered it is possible to jump to the next chapter.
+            new WebDriverWait(driver, Duration.ofSeconds(300))
+                    .until(a -> a.findElement(By.cssSelector("div.reader-chapter-content")));
+        }
+        contentBuilder.append("");
+        String fileName = sanitizeFileName(bookName);
+        String staticStylesheet = ""; // Default style of the reading area from main.css.
+        String content = contentBuilder.toString().replace("min-height: ;", "")
+                // JD's front-end CSS has properties without values; left in place they make epub
+                // conversion report errors (which are mostly harmless).
+                .replace("; height: \"", ";\"")
+                // Drop the minimum image width so small screens can show the images; this has
+                // no effect on images sized in percent.
+                .replaceAll("min-width:(.*?);", "");
+        // try-with-resources closes the file even if a write fails; UTF-8 keeps the output
+        // independent of the platform default charset.
+        try (PrintWriter writer =
+                     new PrintWriter(fileName + ".html", java.nio.charset.StandardCharsets.UTF_8)) {
+            writer.write("" + staticStylesheet + String.join("\n", styleSet) + "");
+            writer.write(content);
+            // PrintWriter swallows I/O errors; surface them instead of reporting success.
+            if (writer.checkError()) {
+                throw new java.io.IOException("Failed to write " + fileName + ".html");
+            }
+        }
+        System.out.printf("《%s》下载完成!%n", fileName);
+    }
+
+    /**
+     * Returns a short console preview: the text following the first
+     * {@code reader-chapter-content} marker, cut to at most {@link #PREVIEW_LENGTH} characters.
+     * Falls back to the start of the markup when the marker is missing.
+     */
+    private static String chapterPreview(String contentHtml) {
+        String[] parts = contentHtml.split("reader-chapter-content");
+        String tail = parts.length > 1 ? parts[1] : contentHtml;
+        return tail.substring(0, Math.min(PREVIEW_LENGTH, tail.length()));
+    }
+
+    /**
+     * Replaces characters that Windows does not allow in file names: colon and question mark
+     * become their full-width forms (U+FF1A, U+FF1F), a double quote becomes a left curly quote,
+     * and backslash, slash, asterisk, angle brackets and pipe become underscores.
+     */
+    private static String sanitizeFileName(String bookName) {
+        return bookName.replace(":", "\uFF1A").replace("?", "\uFF1F").replace("\"", "“")
+                .replaceAll("[\\\\\\/\\*<>\\|]", "_");
+    }
+}
\ No newline at end of file
diff --git a/src/main/java/io/github/lovelyjuice/ReaderPage.java b/src/main/java/io/github/lovelyjuice/ReaderPage.java
new file mode 100644
index 0000000..48f8762
--- /dev/null
+++ b/src/main/java/io/github/lovelyjuice/ReaderPage.java
@@ -0,0 +1,39 @@
+package io.github.lovelyjuice;
+
+import org.openqa.selenium.WebDriver;
+import org.openqa.selenium.WebElement;
+import org.openqa.selenium.support.FindBy;
+import org.openqa.selenium.support.PageFactory;
+import java.util.List;
+
+/**
+ * Page object for the reader view; each field is bound to the CSS selector given in its
+ * {@code @FindBy} annotation.
+ */
+public class ReaderPage {
+
+    /**
+     * Binds the annotated fields of this object through {@link PageFactory#initElements}.
+     *
+     * @param driver browser session in which the elements are looked up
+     */
+    public ReaderPage(WebDriver driver) {
+        PageFactory.initElements(driver, this);
+    }
+
+    /** Container element of the chapter currently displayed. */
+    @FindBy(css = "div.reader-chapter-content")
+    public WebElement content;
+
+    /** The "next chapter" button. */
+    @FindBy(css = "button.nextChapter")
+    public WebElement nextChapterMiddleButton;
+
+    /** Stylesheet links in the document head; typed so callers can stream WebElements. */
+    @FindBy(css = "head > link[rel='stylesheet']")
+    public List<WebElement> styleSheetList;
+
+    /** The page's title element; its text is used as the book name. */
+    @FindBy(css = "title")
+    public WebElement bookName;
+}
\ No newline at end of file
diff --git a/src/main/resources/META-INF/MANIFEST.MF b/src/main/resources/META-INF/MANIFEST.MF
new file mode 100644
index 0000000..2935936
--- /dev/null
+++ b/src/main/resources/META-INF/MANIFEST.MF
@@ -0,0 +1,3 @@
+Manifest-Version: 1.0
+Main-Class: io.github.lovelyjuice.Main
+