# docker-compose.yml (111 lines, 2.78 KB)
version: "3.3"

services:
  # Primary ArangoDB instance backing the ingestion/postprocessing services.
  arangodb:
    image: arangodb:3.7.3
    ports:
      - "8529:8529"
    volumes:
      # The official arangodb image keeps its data in /var/lib/arangodb3
      # (not /var/lib/arango) — mounting the wrong path means the named
      # volume is empty and data is lost when the container is recreated.
      - arangodb_data:/var/lib/arangodb3
    environment:
      ARANGO_ROOT_PASSWORD: "${ARANGO_ROOT_PASSWORD}"

  zookeeper:
    image: wurstmeister/zookeeper
    ports:
      - "2181:2181"

  kafka:
    build:
      context: kafka/
    ports:
      - "9092:9092"
    depends_on:
      - zookeeper
    labels:
      com.htw.export-logs: "true"
    environment:
      KAFKA_LISTENERS: "PLAINTEXT://:9092"
      KAFKA_ADVERTISED_LISTENERS: "PLAINTEXT://kafka:9092"
      KAFKA_ZOOKEEPER_CONNECT: "zookeeper:2181"
      KAFKA_CREATE_TOPICS: "tablecollector:2:1"  # name:partitions:replicas
      KAFKA_AUTO_CREATE_TOPICS_ENABLE: "false"
      # JMX exporter for Prometheus scraping on port 7099.
      EXTRA_ARGS: "-javaagent:/usr/app/jmx_prometheus_javaagent.jar=7099:/usr/app/prom-jmx-agent-config.yml"

  # Throwaway ArangoDB for the test service: tmpfs-backed (data discarded
  # on stop) and auth disabled.
  arangodb-test:
    image: arangodb:3.7.3
    tmpfs: /var/lib/arangodb3  # image's actual data directory
    restart: always
    environment:
      ARANGO_NO_AUTH: "1"  # env values as strings, not ints

  # Anchor template shared by all application services below via <<: *core.
  core: &core
    build: .
    stop_grace_period: 10m
    labels:
      com.htw.export-logs: "true"

  # Consumes crawled tables from Kafka and writes them into ArangoDB.
  database-ingestor:
    <<: *core
    environment:
      KAFKA_BOOTSTRAP_SERVERS: "kafka:9092"
      KAFKA_TOPIC: "tablecollector"
      ARANGO_HOST: "http://arangodb:8529"
      ARANGO_CREDENTIALS: '{"username":"root","password":"${ARANGO_ROOT_PASSWORD}"}'
      ORANGE_CLIENT_ID: "${ORANGE_CLIENT_ID}"
      ORANGE_CLIENT_SECRET: "${ORANGE_CLIENT_SECRET}"
    command: ["python", "-m", "ingestion.kafka_consumer"]

  # Scrapy spider crawling the URL list; exports results to Kafka.
  spider:
    <<: *core
    environment:
      URL_FILE: "data/test-urls.txt"
      FOLLOW_LINKS: "false"  # "true" for deep crawls
      KAFKA_EXPORT: "true"
      CRAWL_ONCE: "true"
      LOG_LEVEL: "INFO"
    command: ["scrapy", "crawl", "web", "-s", "JOBDIR=.scrapy/crawls"]
    volumes:
      - spider_data:/srv/app/.scrapy

  # Spider variant that parses tables out of Common Crawl data.
  cc-spider:
    <<: *core
    environment:
      URL_FILE: "data/wikipedia_urls.csv"
      KAFKA_EXPORT: "true"
      CRAWL_ONCE: "true"
      LOG_LEVEL: "INFO"
    command: ["scrapy", "crawl", "common_crawl_table_parser", "-s", "JOBDIR=.scrapy/crawls"]
    volumes:
      - cc-spider_data:/srv/app/.scrapy/

  # Test runner against the throwaway ArangoDB instance.
  test:
    <<: *core
    command: ["python", "-m", "pytest"]
    environment:
      ARANGO_HOST: "http://arangodb-test:8529"
    depends_on:
      - arangodb-test

  schema_test:
    <<: *core
    command: ["scripts/schema_test.sh"]

  lint:
    <<: *core
    environment:
      REQUIRED_QUALITY: "9"  # minimum pylint score; string, not int
    command: ["pipenv", "run", "lint"]

  # One-shot postprocessing over data already ingested into ArangoDB.
  postprocess:
    <<: *core
    environment:
      ARANGO_HOST: "http://arangodb:8529"
      ARANGO_CREDENTIALS: '{"username":"root","password":"${ARANGO_ROOT_PASSWORD}"}'
      ORANGE_CLIENT_ID: "${ORANGE_CLIENT_ID}"
      ORANGE_CLIENT_SECRET: "${ORANGE_CLIENT_SECRET}"
    command: ["python", "postprocessing/postprocess.py"]

volumes:
  arangodb_data:
  spider_data:
  cc-spider_data: