-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
0 parents
commit 771b8ad
Showing
12 changed files
with
272 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
/*.html | ||
/*.exe | ||
/out/ | ||
/target/ |
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Oops, something went wrong.
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Oops, something went wrong.
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Oops, something went wrong.
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Oops, something went wrong.
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Oops, something went wrong.
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Oops, something went wrong.
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,25 @@ | ||
<?xml version="1.0" encoding="UTF-8"?> | ||
<!-- Maven build descriptor for the jdread-downloader tool. -->
<project xmlns="http://maven.apache.org/POM/4.0.0" | ||
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" | ||
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> | ||
<modelVersion>4.0.0</modelVersion> | ||
|
||
<!-- Coordinates that identify this artifact. -->
<groupId>io.github.lovelyjuice</groupId> | ||
<artifactId>jdread-downloader</artifactId> | ||
<version>1.0-SNAPSHOT</version> | ||
|
||
<!-- Compile for Java 11 (source and target) and treat source files as UTF-8. -->
<properties> | ||
<maven.compiler.source>11</maven.compiler.source> | ||
<maven.compiler.target>11</maven.compiler.target> | ||
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding> | ||
</properties> | ||
|
||
<!-- selenium-java 4.4.0 is the only declared dependency. -->
<dependencies> | ||
<dependency> | ||
<groupId>org.seleniumhq.selenium</groupId> | ||
<artifactId>selenium-java</artifactId> | ||
<version>4.4.0</version> | ||
</dependency> | ||
</dependencies> | ||
|
||
</project> |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,72 @@ | ||
package io.github.lovelyjuice; | ||
|
||
import org.openqa.selenium.By; | ||
import org.openqa.selenium.NoSuchElementException; | ||
import org.openqa.selenium.chrome.ChromeDriver; | ||
import org.openqa.selenium.support.ui.WebDriverWait; | ||
|
||
import java.io.FileNotFoundException; | ||
import java.io.PrintWriter; | ||
import java.time.Duration; | ||
import java.util.LinkedHashSet; | ||
import java.util.Scanner; | ||
import java.util.stream.Collectors; | ||
|
||
public class Main { | ||
public static void main(String[] args) throws FileNotFoundException { | ||
System.setProperty("webdriver.chrome.driver", "chromedriver.exe"); | ||
ChromeDriver driver = new ChromeDriver(); | ||
driver.get("https://passport.jd.com/new/login.aspx?ReturnUrl=https%3A%2F%2Febooks.jd.com%2Fbookshelf"); | ||
System.out.println("操作浏览器登录并跳转到书籍第一页后,输入任意字符开始,输入q退出:"); | ||
while (!(new Scanner(System.in).next()).equals("q")) { | ||
try { | ||
var contentBuilder = new StringBuilder("<body>"); | ||
var styleSet = new LinkedHashSet<String>(); | ||
var readerPage = new ReaderPage(driver); | ||
var bookName = readerPage.bookName.getAttribute("innerText"); | ||
System.out.printf("开始爬取《%s》%n", bookName); | ||
int debugTimes = 0; | ||
String debugFlag = System.getenv("debug"); | ||
while (true) { | ||
if (debugTimes++ > 6 && debugFlag != null) break; //调试的时候只爬取6章 | ||
String contentHtml = readerPage.content.getAttribute("outerHTML"); | ||
System.out.println(contentHtml.split("reader-chapter-content")[1].substring(0, 50)); | ||
contentBuilder.append(contentHtml); | ||
styleSet.addAll(readerPage.styleSheetList.stream() //有些书不同章节有不同的css样式,所以试着合并这些css | ||
.map(webElement -> webElement.getAttribute("outerHTML")) | ||
.collect(Collectors.toList())); | ||
System.out.println("-----------------------"); | ||
try { | ||
readerPage.nextChapterMiddleButton.click(); | ||
} catch (NoSuchElementException e) { | ||
break; //找不到“下一章”按钮说明已经浏览到最后一章 | ||
} | ||
new WebDriverWait(driver, Duration.ofSeconds(300)) | ||
.until(a -> a.findElement(By.cssSelector("div.reader-chapter-content"))); //“下一章”按钮在正文之前被渲染出来,所以只要渲染出按钮就可以跳转到下一章 | ||
} | ||
contentBuilder.append("</body></html>"); | ||
bookName = bookName.replace(":", ":").replace("?", "?").replace("\"", "“") | ||
.replaceAll("[\\\\\\/\\*<>\\|]", "_"); //防止书名中存在Windows不允许的文件名字符 | ||
PrintWriter writer = new PrintWriter(bookName + ".html"); | ||
String staticStylesheet = "<style>.reader-chapter-content>* {" + | ||
" word-wrap: break-all;" + | ||
" margin-top: 18px;" + | ||
" text-align: justify;" + | ||
" word-break: break-word;" + | ||
"}</style>"; // main.css中阅读区域的默认样式 | ||
writer.write("<html><head>" + staticStylesheet + String.join("\n", styleSet) + "</head>"); | ||
String content = contentBuilder.toString().replace("min-height: ;", "") | ||
.replace("; height: \"", ";\"") //京东前端写的css缺少属性值,不删掉的话转换epub时会报错,不过其实报错问题也不大 | ||
.replaceAll("min-width:(.*?);", ""); //解除图片最小宽度限制,小屏设备也能轻松查看,但是对于百分比宽度的图片无效 | ||
writer.write(content); | ||
writer.close(); | ||
System.out.printf("《%s》下载完成!%n", bookName); | ||
} catch (Exception e) { | ||
System.out.println(e); | ||
System.out.println("出错了,请重试!"); | ||
} | ||
System.out.println("跳转到书籍第一页后,输入任意字符开始,输入q退出:"); | ||
} | ||
driver.quit(); | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,27 @@ | ||
package io.github.lovelyjuice; | ||
|
||
import org.openqa.selenium.WebDriver; | ||
import org.openqa.selenium.WebElement; | ||
import org.openqa.selenium.support.FindBy; | ||
import org.openqa.selenium.support.PageFactory; | ||
import java.util.List; | ||
|
||
public class ReaderPage { | ||
|
||
public ReaderPage(WebDriver driver) { | ||
PageFactory.initElements(driver, this); | ||
} | ||
|
||
|
||
@FindBy(css = "div.reader-chapter-content") | ||
public WebElement content; | ||
|
||
@FindBy(css = "button.nextChapter") | ||
public WebElement nextChapterMiddleButton; | ||
|
||
@FindBy(css = "head > link[rel='stylesheet']") | ||
public List<WebElement> styleSheetList; | ||
|
||
@FindBy(css = "title") | ||
public WebElement bookName; | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
Manifest-Version: 1.0 | ||
Main-Class: io.github.lovelyjuice.Main | ||
|