Skip to content

Commit

Permalink
1.0
Browse files Browse the repository at this point in the history
  • Loading branch information
lovelyjuice committed Nov 4, 2022
0 parents commit 771b8ad
Show file tree
Hide file tree
Showing 12 changed files with 272 additions and 0 deletions.
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
/*.html
/*.exe
/out/
/target/
8 changes: 8 additions & 0 deletions .idea/.gitignore

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

73 changes: 73 additions & 0 deletions .idea/artifacts/jdread_downloader_jar.xml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

13 changes: 13 additions & 0 deletions .idea/compiler.xml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

7 changes: 7 additions & 0 deletions .idea/encodings.xml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

20 changes: 20 additions & 0 deletions .idea/jarRepositories.xml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

14 changes: 14 additions & 0 deletions .idea/misc.xml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

6 changes: 6 additions & 0 deletions .idea/vcs.xml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

25 changes: 25 additions & 0 deletions pom.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<!-- Maven build descriptor for the jdread-downloader command-line tool. -->
<modelVersion>4.0.0</modelVersion>

<!-- Project coordinates; the group id matches the Java package io.github.lovelyjuice. -->
<groupId>io.github.lovelyjuice</groupId>
<artifactId>jdread-downloader</artifactId>
<version>1.0-SNAPSHOT</version>

<properties>
<!-- Compile as Java 11: the sources use `var`, so Java 10 or newer is required. -->
<maven.compiler.source>11</maven.compiler.source>
<maven.compiler.target>11</maven.compiler.target>
<!-- The sources contain Chinese string literals and comments, so they must be read as UTF-8. -->
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
</properties>

<dependencies>
<!-- Selenium WebDriver: provides ChromeDriver, WebDriverWait and the PageFactory/@FindBy page-object support used by the sources. -->
<dependency>
<groupId>org.seleniumhq.selenium</groupId>
<artifactId>selenium-java</artifactId>
<version>4.4.0</version>
</dependency>
</dependencies>

</project>
72 changes: 72 additions & 0 deletions src/main/java/io/github/lovelyjuice/Main.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
package io.github.lovelyjuice;

import org.openqa.selenium.By;
import org.openqa.selenium.NoSuchElementException;
import org.openqa.selenium.chrome.ChromeDriver;
import org.openqa.selenium.support.ui.WebDriverWait;

import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.PrintWriter;
import java.nio.charset.StandardCharsets;
import java.time.Duration;
import java.util.LinkedHashSet;
import java.util.Scanner;
import java.util.stream.Collectors;

/**
 * Interactive command-line tool that saves a book opened in the JD web reader as a single HTML file.
 *
 * <p>The program opens Chrome on the JD login page and then waits on standard input. Each time the
 * user enters a token other than {@code q}, it walks the currently opened book chapter by chapter,
 * concatenates the chapter markup and the page's stylesheet links, and writes the result to
 * {@code <book title>.html} in the working directory.
 */
public class Main {

    /** Number of chapters captured per book when the {@code debug} environment variable is set. */
    private static final int DEBUG_CHAPTER_LIMIT = 6;

    /** Maximum number of characters of chapter markup echoed to the console as a progress preview. */
    private static final int PREVIEW_LENGTH = 50;

    /**
     * Starts the browser and runs the prompt loop until the user enters {@code q} or standard input
     * ends.
     *
     * @param args ignored
     * @throws FileNotFoundException kept in the signature for compatibility; failures while
     *     downloading or writing a book are caught, reported on the console, and the loop continues
     */
    public static void main(String[] args) throws FileNotFoundException {
        System.setProperty("webdriver.chrome.driver", "chromedriver.exe");
        ChromeDriver driver = new ChromeDriver();
        try {
            driver.get("https://passport.jd.com/new/login.aspx?ReturnUrl=https%3A%2F%2Febooks.jd.com%2Fbookshelf");
            System.out.println("操作浏览器登录并跳转到书籍第一页后,输入任意字符开始,输入q退出:");
            // A single Scanner is reused for the whole session: creating a new one per prompt can
            // discard input that the previous instance had already buffered. It is deliberately not
            // closed, because closing it would close System.in.
            Scanner console = new Scanner(System.in);
            // hasNext() makes end-of-input behave like "q" instead of throwing.
            while (console.hasNext() && !console.next().equals("q")) {
                try {
                    downloadCurrentBook(driver);
                } catch (Exception e) {
                    // Top-level boundary: report the failure and let the user retry.
                    System.out.println(e);
                    System.out.println("出错了,请重试!");
                }
                System.out.println("跳转到书籍第一页后,输入任意字符开始,输入q退出:");
            }
        } finally {
            // Always shut the browser and the chromedriver process down, even on unexpected errors.
            driver.quit();
        }
    }

    /**
     * Captures every chapter of the book currently shown in the reader and writes it to an HTML
     * file named after the (sanitized) page title.
     *
     * @param driver browser session positioned on the first page of the book
     * @throws IOException if the output file cannot be created or written
     */
    private static void downloadCurrentBook(ChromeDriver driver) throws IOException {
        var contentBuilder = new StringBuilder("<body>");
        // LinkedHashSet: de-duplicates stylesheet tags while keeping first-seen order.
        var styleSet = new LinkedHashSet<String>();
        var readerPage = new ReaderPage(driver);
        var bookName = readerPage.bookName.getAttribute("innerText");
        System.out.printf("开始爬取《%s》%n", bookName);

        boolean debugMode = System.getenv("debug") != null;
        int chaptersCaptured = 0;
        while (true) {
            // When debugging, capture only the first DEBUG_CHAPTER_LIMIT chapters.
            if (debugMode && chaptersCaptured >= DEBUG_CHAPTER_LIMIT) {
                break;
            }
            String contentHtml = readerPage.content.getAttribute("outerHTML");
            System.out.println(chapterPreview(contentHtml));
            contentBuilder.append(contentHtml);
            // Some books use different CSS in different chapters, so merge the stylesheets seen so far.
            styleSet.addAll(readerPage.styleSheetList.stream()
                    .map(webElement -> webElement.getAttribute("outerHTML"))
                    .collect(Collectors.toList()));
            chaptersCaptured++;
            System.out.println("-----------------------");
            try {
                readerPage.nextChapterMiddleButton.click();
            } catch (NoSuchElementException e) {
                break; // No "next chapter" button means the last chapter has been reached.
            }
            // Wait (up to 300 s) for the chapter container to be present before reading it again.
            new WebDriverWait(driver, Duration.ofSeconds(300))
                    .until(a -> a.findElement(By.cssSelector("div.reader-chapter-content")));
        }
        contentBuilder.append("</body></html>");

        bookName = sanitizeFileName(bookName);
        writeBook(bookName + ".html", styleSet, contentBuilder.toString());
        System.out.printf("《%s》下载完成!%n", bookName);
    }

    /**
     * Returns a short progress snippet: the text following the first
     * {@code reader-chapter-content} marker, truncated to {@link #PREVIEW_LENGTH} characters.
     * Falls back to the start of the whole markup if the marker is missing, and never fails on
     * short chapters.
     */
    private static String chapterPreview(String contentHtml) {
        String[] parts = contentHtml.split("reader-chapter-content");
        String tail = parts.length > 1 ? parts[1] : contentHtml;
        return tail.substring(0, Math.min(PREVIEW_LENGTH, tail.length()));
    }

    /**
     * Replaces characters that Windows forbids in file names: {@code :}, {@code ?} and {@code "}
     * become their full-width / typographic look-alikes, and {@code \ / * < > |} become {@code _}.
     */
    private static String sanitizeFileName(String bookName) {
        return bookName
                .replace(":", "\uFF1A")   // full-width colon
                .replace("?", "\uFF1F")   // full-width question mark
                .replace("\"", "\u201C")  // left double quotation mark
                .replaceAll("[\\\\\\/\\*<>\\|]", "_");
    }

    /**
     * Writes the assembled book as a UTF-8 HTML document.
     *
     * @param fileName destination file, relative to the working directory
     * @param styleTags stylesheet {@code <link>} tags collected from the reader page
     * @param bodyHtml concatenated chapter markup, including the closing body/html tags
     * @throws IOException if the file cannot be created or written
     */
    private static void writeBook(String fileName, LinkedHashSet<String> styleTags, String bodyHtml)
            throws IOException {
        // Default style of the reading area, copied from the site's main.css.
        String staticStylesheet = "<style>.reader-chapter-content>* {" +
                " word-wrap: break-all;" +
                " margin-top: 18px;" +
                " text-align: justify;" +
                " word-break: break-word;" +
                "}</style>";
        // The site's inline CSS contains properties with missing values; removing them avoids
        // errors when the HTML is later converted to EPUB.
        String content = bodyHtml.replace("min-height: ;", "")
                .replace("; height: \"", ";\"")
                // Drop min-width limits so images fit small screens (no effect on percentage widths).
                .replaceAll("min-width:(.*?);", "");
        // Explicit UTF-8 (declared in the document as well) so the Chinese text does not depend on
        // the platform default charset; try-with-resources closes the file even if a write fails.
        try (PrintWriter writer = new PrintWriter(fileName, StandardCharsets.UTF_8)) {
            writer.write("<html><head><meta charset=\"UTF-8\">" + staticStylesheet
                    + String.join("\n", styleTags) + "</head>");
            writer.write(content);
        }
    }
}
27 changes: 27 additions & 0 deletions src/main/java/io/github/lovelyjuice/ReaderPage.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
package io.github.lovelyjuice;

import org.openqa.selenium.WebDriver;
import org.openqa.selenium.WebElement;
import org.openqa.selenium.support.FindBy;
import org.openqa.selenium.support.PageFactory;
import java.util.List;

/**
 * Page object for the web reader page: each public field is bound to a CSS selector through
 * {@link FindBy}, and the constructor hands the instance to Selenium's {@link PageFactory} so the
 * fields are populated for the given driver.
 */
public class ReaderPage {

    /** The document's {@code <title>} element; its text is used as the book name. */
    @FindBy(css = "title")
    public WebElement bookName;

    /** Container element holding the markup of the chapter currently displayed. */
    @FindBy(css = "div.reader-chapter-content")
    public WebElement content;

    /** All stylesheet {@code <link>} elements that are direct children of {@code <head>}. */
    @FindBy(css = "head > link[rel='stylesheet']")
    public List<WebElement> styleSheetList;

    /** The button that advances the reader to the next chapter. */
    @FindBy(css = "button.nextChapter")
    public WebElement nextChapterMiddleButton;

    /**
     * Binds the annotated fields of this page object to the given browser session.
     *
     * @param driver the WebDriver whose current page the fields are resolved against
     */
    public ReaderPage(WebDriver driver) {
        PageFactory.initElements(driver, this);
    }
}
3 changes: 3 additions & 0 deletions src/main/resources/META-INF/MANIFEST.MF
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
Manifest-Version: 1.0
Main-Class: io.github.lovelyjuice.Main

0 comments on commit 771b8ad

Please sign in to comment.