Skip to content

Commit

Permalink
feat: 替换 HTML 解析实现
Browse files: browse the repository at this point in the history
  • Loading branch information
FHU-yezi committed Mar 30, 2024
1 parent 9a21b72 commit fb82d34
Show file tree
Hide file tree
Showing 2 changed files with 11 additions and 10 deletions.
9 changes: 3 additions & 6 deletions jkit/article.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
from datetime import datetime
from enum import Enum
from re import sub as re_sub
from typing import (
TYPE_CHECKING,
Any,
@@ -13,8 +12,6 @@
)

from httpx import HTTPStatusError
from lxml.html import HtmlElement
from lxml.html import fromstring as parse_html

from jkit._base import (
DATA_OBJECT_CONFIG,
@@ -30,6 +27,7 @@
normalize_percentage,
)
from jkit.config import CONFIG
from jkit.constants import BLANK_LINES_REGEX, HTML_TAG_REGEX
from jkit.exceptions import ResourceUnavailableError
from jkit.identifier_check import is_article_slug
from jkit.identifier_convert import article_slug_to_url, article_url_to_slug
@@ -109,9 +107,8 @@ class ArticleInfo(DataObject, **DATA_OBJECT_CONFIG):

@property
def text_content(self) -> str:
html_obj: HtmlElement = parse_html(self.html_content)
result = "".join(html_obj.itertext()) # type: ignore
return re_sub(r"\s{3,}", "", result) # 去除多余的空行
result = HTML_TAG_REGEX.sub("", self.html_content)
return BLANK_LINES_REGEX.sub("\n", result)


class ArticleAudioInfo(DataObject, **DATA_OBJECT_CONFIG):
Expand Down
12 changes: 8 additions & 4 deletions jkit/constants.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
from re import compile as regex_compile

USER_NAME_REGEX = regex_compile(r"^[\w]*$")
MAX_ID = 10**9

HTML_TAG_REGEX = regex_compile("<.*?>")
BLANK_LINES_REGEX = regex_compile("\n{2,}")

JIANSHU_URL_REGEX = regex_compile(r"^https://www\.jianshu\.com/[a-zA-Z0-9/]*/?$")
USER_UPLOADED_URL_REGEX = regex_compile(r"^https?:\/\/.*/?$")
@@ -14,9 +17,10 @@

USER_SLUG_REGEX = regex_compile(r"^[a-zA-Z0-9]{6,12}$")
ARTICLE_SLUG_REGEX = regex_compile(r"^[a-zA-Z0-9]{12}$")
NOTEBOOK_ID_MIN = 10000000
NOTEBOOK_ID_MAX = 99999999
COLLECTION_SLUG_REGEX = regex_compile(r"^[a-zA-Z0-9]{6,12}$")
ISLAND_SLUG_REGEX = regex_compile(r"^[a-zA-Z0-9]{16}$")

MAX_ID = 10**9
NOTEBOOK_ID_MIN = 10000000
NOTEBOOK_ID_MAX = 99999999

USER_NAME_REGEX = regex_compile(r"^[\w]*$")

0 comments on commit fb82d34

Please sign in to comment.