rss_daily.py
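"""Scan the arXiv RSS feeds for selected categories, keep new papers that match
a keyword list, and write the results as Markdown digests (plus README counts)."""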
import os
import random
import time
from datetime import datetime
import feedparser
import requests
from md2html import md2html

# Browser-like headers so the Papers with Code API does not reject the requests.
headers = {
    'User-Agent':
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36 Edg/114.0.1823.67',
    'Connection': 'close'
}

# Keywords matched (case-insensitively) against titles and abstracts.
keywords = [
    'pointcloud', 'railway', 'BIM', 'procedural modeling', 'segmentation', '3D point cloud', 'railway infrastructure',
    'point cloud segmentation', 'extraction', 'Lidar', 'Infrastructure information models'
]

# arXiv categories whose RSS feeds are scanned.
categories = ['cs.CV', 'cs.CL', 'cs.CR', 'cs.LG']

# Local proxy used for the Papers with Code API requests.
proxies = {'https': '127.0.0.1:10809'}


def find_keyword(summary):
    """Return the first keyword found in the text (case-insensitive), else None."""
    summary = summary.lower()
    for keyword in keywords:
        if keyword.lower() in summary:
            return keyword
    return None


def check_title(title):
    """Keep only new (not UPDATED) entries whose trailing '(arXiv:... [cs.XX])'
    tag names one of the wanted categories."""
    words = title.split('(')[-1].split()
    if len(words) > 2:
        # Titles of updated papers carry an extra 'UPDATED)' token; skip them.
        return False
    category = words[1][1:6]  # strip the leading '[' from '[cs.XX])'
    if category in categories:
        return True
    return False
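
# Illustrative behaviour of the two filters above on a hypothetical RSS title
# and abstract (examples not taken from a real feed):
#   check_title('Example paper. (arXiv:2307.00001v1 [cs.CV])')          -> True
#   check_title('Example paper. (arXiv:2307.00001v1 [cs.CV] UPDATED)')  -> False
#   find_keyword('We segment railway infrastructure from LiDAR scans.') -> 'railway'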


def get_code_url(short_id):
    """Query the Papers with Code API for the paper's official code repository URL."""
    base_url = 'https://arxiv.paperswithcode.com/api/v0/repos-and-datasets/'
    time.sleep(random.random())  # small random delay between API requests
    data = requests.get(base_url + short_id, headers=headers, proxies=proxies).json()
    if data and 'code' in data:
        if data['code'] and 'official' in data['code']:
            if data['code']['official'] and 'url' in data['code']['official']:
                return data['code']['official']['url']
    return None
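
# Response shape assumed by the nested checks above (inferred from the code,
# not from API documentation):
#   {"code": {"official": {"url": "https://github.com/..."}}}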


def main():
    rss_addr = 'http://export.arxiv.org/rss/'
    paper_ids = set()
    # One bucket of formatted Markdown items per keyword.
    keywords_bin = {k: list() for k in keywords}
    for category in categories:
        data = feedparser.parse(f'{rss_addr}{category}')
        if data and hasattr(data, 'entries') and len(data.entries) > 0:
            for entry in data.entries:
                if entry.id in paper_ids:
                    continue
                if not check_title(entry.title):
                    continue
                # Prefer a keyword hit in the title, fall back to the abstract.
                keyword = find_keyword(entry.title)
                if keyword is None:
                    keyword = find_keyword(entry.summary)
                if keyword is None:
                    continue
                item = '### Title: {}\n'.format(entry.title)
                item += '* Paper URL: [{}]({})\n'.format(entry.link, entry.link)
                code_url = get_code_url(entry.id.split('/')[-1])
                if code_url is not None:
                    item += f'* Code URL: [{code_url}]({code_url})\n'
                else:
                    item += '* Code URL: null\n'
                item += f'* Copy Paste: `<input type="checkbox">[[{entry.link.split("/")[-1]}] {entry.title.split(".")[0]}]({entry.link}) #{keyword}`\n'
                item += f'* Summary: {entry.summary}\n\n'
                keywords_bin[keyword].append(item)
                paper_ids.add(entry.id)

    now = datetime.utcnow()
    # Rewrite README.md with a per-keyword count table.
    with open('README.md', 'w') as fp:
        fp.write('# arxiv-daily\n')
        fp.write(f'updated on {now}\n')
        fp.write('| keyword | count |\n')
        fp.write('| - | - |\n')
        for keyword in keywords:
            fp.write(f'| {keyword} | {len(keywords_bin[keyword])} |\n')

    # Write today's matches to rss/YYYY-MM-DD.md and convert them to HTML.
    os.makedirs('rss/', exist_ok=True)
    file = '{}.md'.format(datetime.strftime(now, '%Y-%m-%d'))
    with open(f'rss/{file}', 'w', buffering=1) as fp:
        for keyword in keywords:
            fp.write(f'## {keyword}\n')
            for item in keywords_bin[keyword]:
                fp.write(item)
    md2html(file, 'rss', 'html')


if __name__ == '__main__':
    main()
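
# Running the script (an assumption about the intended workflow, not documented
# in the file itself): `python rss_daily.py` with feedparser, requests and the
# local md2html module importable, and a proxy listening on 127.0.0.1:10809.
# It rewrites README.md with the per-keyword counts and writes the day's matched
# papers to rss/<YYYY-MM-DD>.md, which md2html presumably converts under html/.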