-
Notifications
You must be signed in to change notification settings - Fork 2
59 lines (58 loc) · 2.15 KB
/
manual.yml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
name: Manual execution
on:
workflow_dispatch:
inputs:
page_from:
description: the page to start from
default: 1
required: true
page_to:
description: the page to latest end on
default: 1000
required: true
modified_from:
description: collect only papers at this date or older
default: 2023-04-29
required: true
modified_to:
description: collect only papers at this date or younger
default: 2023-04-30
required: true
jobs:
scrape-papers:
name: Scraper paper content
runs-on: ubuntu-latest
environment: scrape_papers_envs
steps:
- uses: actions/checkout@v4
- uses: actions/setup-python@v5
with:
python-version: '3.11'
cache: 'pip'
- name: Install deps
run: pip install -r requirements.txt
- name: Download paper JSONs
run: python3 ./1_read_paper_json.py --page_from ${{ github.event.inputs.page_from }} --page_to ${{ github.event.inputs.page_to }} --modified_to ${{ github.event.inputs.modified_to }} --modified_from ${{ github.event.inputs.modified_from }}
- name: Download PDFs from URL contained main file field in paper JSON data entries
run: python3 ./2_download_pdfs.py
- name: Extract text from PDF files (and put it in a txt file for each PDF)
run: python3 ./3_txt_extraction.py
- name: Concat import JSON file from the contents of the paper JSONs and the extracted text content
run: python3 ./4_srm_import.py
- name: Upload artifact
uses: actions/upload-artifact@v3
with:
name: import-json
path: input.json
retention-days: 1
- uses: Dylan700/sftp-upload-action@latest
with:
server: ${{ secrets.FTP_URL }}
username: ${{ secrets.FTP_USER }}
password: ${{ secrets.FTP_PASSWORD }}
uploads: |
./ => ./
ignore: |
!input.json
- name: Trigger import
run: curl --write-out '%{http_code}' --silent --output /dev/null -X POST ${{ secrets.IMPORT_URL }}?secret=${{ secrets.SHARED_IMPORT_SECRET }}