-
Notifications
You must be signed in to change notification settings - Fork 0
/
testbaseball.py
36 lines (32 loc) · 1.39 KB
/
testbaseball.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
from bs4 import BeautifulSoup
import requests
from utils import korean_character_ratio
# body = open('44.html')
title = ""
result = []
url = 'https://sports.news.naver.com/kbaseball/news/index?isphoto=N&page=10'

# Fetch the page once; the same text is printed, dumped to disk and parsed.
body = requests.get(url).text
print(body)

# Append the raw HTML to links.html. The with-block closes the file itself,
# and the explicit encoding keeps Korean text independent of the OS default.
with open("links.html", "a", encoding="utf-8") as f:
    f.write(body)

# BeautifulSoup expects markup text (or a file handle), not a Response
# object, so parse the already-downloaded body instead of issuing a second
# request.
soup = BeautifulSoup(body, 'html.parser')

# Keep the empty default when the document has no <title> element.
if soup.title is not None:
    title = soup.title.string
# Literal marker strings (title / subheading 1 / subheading 2 / closing,
# each with a "start" and "end" variant) that are skipped, not collected.
_SECTION_MARKERS = frozenset({
    '타이틀 시작',
    '타이틀 종료',
    '소제목1 종료',
    '소제목1 시작',
    '소제목2 시작',
    '소제목2 종료',
    '마무리 시작',
    '마무리 종료',
})

# TODO: the document fetched right now comes back without the expected
# descendants, so the container lookup below can return None.
big_body = soup.find('div', "tt_article_useless_p_margin")  # Tag or None

# Fall back to "no paragraphs" instead of raising AttributeError when the
# container div is missing.
tags = (
    big_body.find_all('p', attrs={'data-ke-size': 'size16'})
    if big_body is not None
    else []
)

for tag in tags:
    # Per Beautiful Soup, .string is None unless the tag resolves to a
    # single string; such paragraphs are ignored.
    if tag.string is None:
        continue

    # Drop non-breaking spaces, then surrounding whitespace.
    stripped = tag.string.replace("\xa0", "").strip()
    if not stripped:
        continue

    # Skip text scoring below 0.5 on korean_character_ratio (treated as
    # "not Korean content"; the helper is defined in utils).
    if korean_character_ratio(stripped, ignore_whitespace=True) < 0.5:
        continue

    if stripped in _SECTION_MARKERS:
        continue

    result.append(stripped)