-
Notifications
You must be signed in to change notification settings - Fork 15
/
covid_plot_tests.py
855 lines (792 loc) · 39 KB
/
covid_plot_tests.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
import io
import os
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from dateutil.parser import parse as d
import utils_thai
from covid_data_api import ihme_dataset
from covid_data_testing import get_variant_api
from covid_plot_utils import plot_area
from covid_plot_utils import source
from utils_pandas import import_csv
from utils_pandas import perc_format
from utils_pandas import rearrange
from utils_pandas import topprov
from utils_scraping import any_in
from utils_scraping import logger
from utils_thai import area_crosstab
from utils_thai import AREA_LEGEND
from utils_thai import AREA_LEGEND_ORDERED
from utils_thai import AREA_LEGEND_SIMPLE
from utils_thai import DISTRICT_RANGE
from utils_thai import DISTRICT_RANGE_SIMPLE
from utils_thai import FIRST_AREAS
from utils_thai import join_provinces
from utils_thai import trend_table
# Manual weekly estimates of the BA.1 / BA.2 split, eyeballed from the plots
# of sequenced variants in the reports.
# Values are percentages; combined_variant_reports() divides them by 100 and
# converts `week` into a date as 2019-12-27 + week * 7 days.
est_variants = """
week,BA.1 (Omicron),BA.2 (Omicron)
100, 100, 0
101, 100, 0
102, 100, 0
103, 100, 0
104, 100, 0
105, 97, 3
106, 97, 3
107, 95, 5
108, 92, 8
109, 90, 10
110, 80, 20
111, 75, 25
112, 70, 30
113, 45, 55
114, 40, 60
115, 16, 83
116, 15, 85
117, 14, 86
118, 8, 92
119, 4, 96
120, 3, 97
"""
# Maps a lineage-name prefix to the variant group label used in the plots.
# group_seq() walks this dict from the LAST entry to the first and takes the
# first key the (upper-cased) lineage name starts with, so a more specific
# prefix must be listed AFTER any shorter prefix it shares (e.g. "BA.2.75"
# after "BA.2"). Names matching no key fall back to "Other".
# Insertion order of the values also drives the column order in
# save_variant_plots().
# https://github.com/neherlab/SARS-CoV-2_variant-reports/blob/main/reports/variant_report_latest_draft.md
# 'B.1.1.7 (Alpha)'
# 'B.1.351 (Beta)'
# 'B.1.617.2 (Delta)'
groups = {
    'B.1.36.16': 'B.1.36.16 (สมุทรสาคร)',
    'B.1.1': 'B.1.1.7 (Alpha)',
    'B.1.351': 'B.1.351 (Beta)',
    'B.1.617.2': 'B.1.617.2 (Delta)',
    'AY.': 'B.1.617.2 (Delta)',
    "BA.1": "BA.1 (Omicron)",
    "BA.2": "BA.2 (Omicron)",
    "BA.4": "BA.4/BA.5 (Omicron)",
    "BA.5": "BA.4/BA.5 (Omicron)",
    "DY.": "BA.4/BA.5 (Omicron)",
    "CK.": "BA.4/BA.5 (Omicron)",
    "BQ.": "BA.4/BA.5 (Omicron)",
    "CN.": "BA.4/BA.5 (Omicron)",
    "ED.": "BA.4/BA.5 (Omicron)",
    "EZ.": "BA.4/BA.5 (Omicron)",
    "FM.": "BA.4/BA.5 (Omicron)",
    "BA.2.75": "BA.2.75/BN.1/CH.1 (Centaurus)",
    "BA.2.76": "BA.2.75/BN.1/CH.1 (Centaurus)",
    "BN.": "BA.2.75/BN.1/CH.1 (Centaurus)",
    "CH.": "BA.2.75/BN.1/CH.1 (Centaurus)",
    "BR.": "BA.2.75/BN.1/CH.1 (Centaurus)",
    "FK.": "BA.2.75/BN.1/CH.1 (Centaurus)",
    "DV.": "BA.2.75/BN.1/CH.1 (Centaurus)",
    "EJ.": "BA.2.75/BN.1/CH.1 (Centaurus)",
    "FY.": "XBB (Kraken/Arcturus)",  # see https://github.com/MurrellGroup/lineages
    "FU.": "XBB (Kraken/Arcturus)",
    "EQ.": "XBB (Kraken/Arcturus)",
    "EM.": "XBB (Kraken/Arcturus)",
    "EU.": "XBB (Kraken/Arcturus)",
    "EK.": "XBB (Kraken/Arcturus)",
    "FL.": "XBB (Kraken/Arcturus)",
    "XBB": "XBB (Kraken/Arcturus)",
    "GY.": "XBB (Kraken/Arcturus)",
    "GJ.": "XBB (Kraken/Arcturus)",
    "GS.": "XBB (Kraken/Arcturus)",
    "GE.": "XBB (Kraken/Arcturus)",
    "GA.": "XBB (Kraken/Arcturus)",
    "HK.3": "XBB (Kraken/Arcturus)",  # from EG.5.1?
    "HN.": "XBB (Kraken/Arcturus)",
    "EG.": "Other",  # "EG (Eris)", - didn't have much impact
    # https://www.news-medical.net/news/20240209/Understanding-the-virological-properties-of-SARS-CoV-2-variant-JN1.aspx
    "BA.2.86": "BA.2.86/JN.1 (Pirola)",
    "JN.": "BA.2.86/JN.1 (Pirola)",
    "KP.1.1": "KP.* (FLiRT)",
    "KP.2": "KP.* (FLiRT)",
    "KP.": "KP.* (FLiRT)",
    "LB.1": "KP.* (FLiRT)",
    "KS.1": "KP.* (FLiRT)",
    # BA.2.86 recombinants - XDQ: BA.2.86.1 with 3' from FL.15.1.1, lacks S:L455S as not from JN.1 but sublineage XDQ.1 has S:A475V. Around 5% at end of January 2024 in South Korea and Japan, possibly growing.
    # "XDQ": "XDQ",
    # XDV Recombinant lineage of XDE, JN.1, XDE, JN.1 (breakpoints between 19327-21608, 27916-28296, 28959-29534), USA/China, from #2402
    # XDV.1 S:F456L (T22930A), on C11572T branch, Hong Kong
    # Explicit fallback label; also keeps "Other" in the ordered list of group labels.
    "Other": "Other",
}
def group_seq(seq):
    """Collapse per-lineage detections into per-period shares by variant group.

    Args:
        seq: DataFrame with one column per lineage name and one row per
            period. The index must be named "End" because it is used as the
            pivot index below.

    Returns:
        DataFrame indexed by "End" with one column per variant group label
        (a value of ``groups``, or "Other" when no prefix matches). Each row
        is divided by its own total, so the values are fractions of that
        period's detections.
    """
    def group(variant):
        # Scan `groups` back to front so that later (more specific) prefixes
        # such as "BA.2.75" take precedence over earlier ones such as "BA.2".
        return next(
            (label for match, label in reversed(groups.items()) if variant.upper().startswith(match)),
            "Other",
        )

    # Long format: one row per (lineage, End) pair with its detection count.
    unstacked = seq.unstack().reset_index(name="Detected").rename(columns=dict(level_0="Variant"))
    unstacked['Variant Group'] = unstacked['Variant'].apply(group)
    # Sum lineages belonging to the same group, back to one row per period.
    seq = pd.pivot_table(unstacked, columns="Variant Group", values="Detected", index="End", aggfunc="sum")
    # Turn counts into the share of each period's total.
    return seq.div(seq.sum(axis=1), axis=0)
def combined_variant_reports(min_samples=20):
    """Merge the variant report sources into one daily table of variant shares.

    Sources, in the order they are loaded:
      * "variants_sequenced" - sequencing data (fewer samples, more detail),
        grouped with group_seq() and topped up with the manual ``est_variants``.
      * "variants" - SNP genotyping data (major variant types).
      * "variants_by_area" - PCR based surveillance by health area.

    Args:
        min_samples: periods of sequencing data whose row total is below this
            are discarded.

    Returns:
        DataFrame reindexed to daily frequency from the first available date
        to the last date that has a 'BA.2 (Omicron)' value, with gaps
        interpolated.
    """
    # Variants: sequencing data has fewer samples but more detail
    sequenced = import_csv("variants_sequenced", index=["End"], date_cols=["End"]).fillna(0)
    enough_samples = sequenced.sum(axis=1) >= min_samples
    # If there are not enough samples in a period we won't use it
    sequenced = sequenced[enough_samples]
    # Group into major categories, BA.2 vs BA.1
    sequenced = group_seq(sequenced)

    # Add in the manual values (percent per week -> fraction per End date)
    manual = pd.read_csv(io.StringIO(est_variants))
    manual['End'] = (manual['week'] * 7).apply(lambda days: pd.DateOffset(days) + d("2019-12-27"))
    manual = manual.set_index("End").drop(columns=["week"]) / 100
    sequenced = sequenced.combine_first(manual)
    # last_data = seq.index.max()  # Sequence data is behind genotyping. Let's not interpolate past best data we have

    genotyped = import_csv("variants", index=["End"], date_cols=["End"]).fillna(0)
    genotyped = genotyped.rename(columns={'B.1.1.529 (Omicron)': 'BA.1 (Omicron)'})
    genotyped = genotyped.apply(lambda row: row / row.sum(), axis=1)

    # The sequenced table is all omicron variants, so scale it by the omicron
    # share from genotyping
    scaled = sequenced.multiply(genotyped["BA.1 (Omicron)"], axis=0)
    sequenced = scaled.combine_first(sequenced.loc["2023-01-20":])
    sequenced = sequenced.rename(columns={'Other (Omicron)': 'Other'})  # Now includes BQ.X
    # TODO: missing seq data results in all BA.1, so either need an "other omicron"
    # or NaN data after the date we are sure it's not all BA.1
    genotyped.loc["2021-12-24":, 'BA.1 (Omicron)'] = np.nan
    # Fill in leftover dates with SNP genotyping data (major variant types)
    combined = sequenced.combine_first(genotyped)

    # This is the PCR based surveillance. Less detailed but more samples and
    # 1 week ahead of sequencing.
    by_area = import_csv("variants_by_area", index=["Start", "End"], date_cols=["Start", "End"])
    by_area = by_area.groupby(["Start", "End"]).sum()
    by_area = by_area.reset_index().drop(columns=["Health Area", "Start"]).set_index("End")
    by_area = by_area.rename(columns={
        "B.1.1.529 (Omicron)": "Other",
        "BA.2.75 (Omicron)": "BA.2.75/BN.1/CH.1 (Centaurus)",
    })
    by_area = by_area.apply(lambda row: row / row.sum(), axis=1)

    # Omicron didn't get split out until 2022-06-24 so get rid of the rest
    # TODO: should we prefer seq data or pcr data?
    combined = combined.combine_first(by_area["2022-06-24":])

    # Only interpolate up to the last date that still has BA.2 data
    final_date = combined['BA.2 (Omicron)'].last_valid_index()
    daily_index = pd.date_range(combined.index.min(), final_date, freq='D')
    return combined.reindex(daily_index).interpolate()
def save_variant_plots(df: pd.DataFrame) -> None:
    """Plot cases, estimated infections, deaths and ventilated patients by variant.

    Variant shares come from combined_variant_reports(); when the GISAID based
    data from get_variant_api() is non-empty and at least as recent, it
    replaces the report data from 2022-01-02 onwards. The shares are
    interpolated to daily values over the date range of ``df`` and multiplied
    by each daily series before plotting.

    Args:
        df: date-indexed DataFrame providing the 'Cases', 'Deaths' and
            'Hospitalized Respirator' columns.
    """
    variants = combined_variant_reports(min_samples=19)
    api = get_variant_api(other_threshold=0.0, nday_threshold=1)
    api = api.resample("W-SAT", label='right', closed='right').mean()
    # api = api.rolling("7d").mean()
    # api = api[api.sum(axis=1) > 5]  # If not enough samples we won't use it
    if not api.empty and variants.index.max() <= api.index.max():
        # api seems not have alpha beta. maybe because not enough sequence data then?
        variants = pd.concat([variants[:"2022-01-01"], group_seq(api)["2022-01-02":]])
        variants = variants.replace(np.nan, 0)
        foot_source = f'{source}Data Source: GISAID'
    else:
        logger.warning("Using Variants from reports. GISAID problem, or old")
        foot_source = f'{source}Data Source: SARS-CoV-2 variants in Thailand(DMSc)'
    last_update = variants.last_valid_index()

    # Interpolate up to current data
    # TODO put in different colour to show its a prediction
    variants = variants.reindex(pd.date_range(df.index.min(), df.index.max(), freq='D')).interpolate()
    footnote = "Estimate of variants in {} based on random sampling\nof Case PCR Genetic sequencing submitted to GISAID."

    # Order the columns like the values of `groups`, with "Other" last.
    cols = rearrange(variants.columns.to_list(), *dict.fromkeys(groups.values()).keys())
    cols = rearrange(cols, "Other", first=False)

    def plot_by_variant(data, title, png_prefix, subject, **extra):
        # All four charts share the same styling; only data, title, file
        # prefix and footnote subject differ.
        plot_area(df=data,
                  title=title,
                  png_prefix=png_prefix, cols_subset=cols,
                  ma_days=7,
                  kind='area', stacked=True, percent_fig=True,
                  cmap='tab10',
                  # y_formatter=perc_format,
                  last_update=last_update,
                  footnote=footnote.format(subject),
                  footnote_left=foot_source,
                  **extra)

    # `cols` was computed before this column is added, so it holds variants only.
    variants['Cases'] = df['Cases']
    case_variants = (variants[cols].multiply(variants['Cases'], axis=0)).dropna(axis=0, how="all")
    # cols = sorted(variants.columns, key=lambda c: c.split("(")[1])
    plot_by_variant(case_variants,
                    'Cases by Major Variant - Interpolated from Sampling - Thailand',
                    'cases_by_variants',
                    "Cases")

    ihme = ihme_dataset(check=False)
    inf_variants = (variants[cols].multiply(ihme['inf_mean'], axis=0)).dropna(axis=0, how="all")
    plot_by_variant(inf_variants,
                    'Est. Infections by Major Variant - Interpolated from Sampling - Thailand',
                    'inf_by_variants',
                    "IHME Infections Prediction")

    death_variants = (variants[cols].multiply(df['Deaths'], axis=0)).dropna(axis=0, how="all")
    plot_by_variant(death_variants,
                    'Deaths by Major Variant - Interpolated from Sampling - Thailand',
                    'deaths_by_variants',
                    "Deaths")

    # Gaps in the ventilator series are interpolated before weighting.
    hosp_variants = (variants[cols].multiply(df['Hospitalized Respirator'].interpolate(), axis=0)).dropna(axis=0, how="all")
    plot_by_variant(hosp_variants,
                    'Hospitalized on Ventilator by Major Variant - Thailand',
                    'hosp_by_variants',
                    "Hospitalized on Ventilator",
                    periods_to_plot=["3", "all"])
def save_tests_plots(df: pd.DataFrame) -> None:
    """Plot testing volumes, positive rates and cases-vs-positives charts.

    Derived columns (walk-in cases, positivity measures, private/public
    ratios, cumulative 3rd wave counts) are added to a local copy of ``df``
    and plotted with plot_area(). Charts are written with these png prefixes:
    tests, tested_pui, tests_peak, positivity, positivity_all,
    tests_private_ratio, cases, cases_tests, cases_tests_cum3, cases_all.

    Args:
        df: date-indexed DataFrame with the daily case and testing columns
            referenced below (e.g. 'Cases', 'Tests', 'Pos', 'Tested PUI').
    """
    # # matplotlib global settings
    # matplotlib.use('AGG')
    # plt.style.use('dark_background')

    # # create directory if it does not exists
    # pathlib.Path('./outputs').mkdir(parents=True, exist_ok=True)

    # Computed data
    # TODO: has a problem if we have local transmission but no proactive
    # TODO: put somewhere else
    walkins = pd.DataFrame(df["Cases Local Transmission"] - df["Cases Proactive"], columns=['Cases Walkin'])
    # In case XLS is not updated before the pptx: fall back to 'Tests'/'Pos'
    # wherever 'Tests XLS'/'Pos XLS' are missing.
    df = df.combine_first(walkins).combine_first(
        df[['Tests', 'Pos']].rename(columns=dict(Tests="Tests XLS", Pos="Pos XLS")))

    dash = import_csv("moph_dashboard", ["Date"], False, dir="inputs/json")
    df['ATK+'] = dash['Infections Non-Hospital Cum'].cumsum().interpolate(limit_area="inside").diff()

    cols = [
        'Tests XLS',
        'Tests ATK',
        'Tests Public',
        'Tested PUI',
        'Tested PUI Walkin Public',
        'Tests ATK Proactive',
    ]
    legends = [
        'PCR Tests',
        'ATK Tests (DMSC)',
        'PCR Tests (Public Hospitals)',
        'Persons Under Investigation (PUI)',
        'Persons Under Investigation (Public Hospitals)',
        'ATK Tests (NHSO provided)',
    ]
    plot_area(df=df,
              title='PCR Tests and PUI - Thailand',
              legends=legends,
              png_prefix='tests', cols_subset=cols,
              ma_days=7,
              kind='line', stacked=False, percent_fig=False,
              cmap='tab10',
              # actuals=['Tests XLS'],
              footnote='Note: PCR tests likely higher than shown ( due to cases > PCR Positives)\n'
              'PCR: Polymerase Chain Reaction\n'
              'PUI: Person Under Investigation\n'
              'Proactive: Testing done at high risk locations, rather than random sampling.',
              footnote_left=f'{source}Data Sources: Daily Situation Reports\n DMSC: Thailand Laboratory Testing Data')

    cols = [
        'Tested Cum',
        'Tested PUI Cum',
        'Tested Proactive Cum',
        'Tested Quarantine Cum',
        'Tested PUI Walkin Private Cum',
        'Tested PUI Walkin Public Cum',
    ]
    legends = [
        'People Checked',
        'Person Under Investigation (PUI)',
        'PUI Proactive',
        'PUI Quarantine',
        'PUI Walk-in (Private Hospital)',
        'PUI Walk-in (Public Hospital)',
    ]
    plot_area(df=df,
              title='People Under Investigation (PUI) - Cumulative - Thailand',
              legends=legends,
              png_prefix='tested_pui', cols_subset=cols,
              ma_days=7,
              kind='line', stacked=False, percent_fig=False,
              cmap='tab10',
              footnote='Note: Excludes some proactive tests.\n'
              'PCR: Polymerase Chain Reaction\n'
              'PUI: Person Under Investigation\n'
              'Proactive: Testing done at high risk locations, rather than random sampling.',
              footnote_left=f'{source}Data Sources: Daily Situation Reports\n DMSC: Thailand Laboratory Testing Data')

    # kind of dodgy since ATK is subset of positives but we don't know total ATK
    cols = [
        'Cases',
        'Cases Proactive',
        'Tests XLS',
        'Tests ATK Proactive',
    ]
    legends = [
        "Cases from PCR Tests",
        "Cases from Proactive PCR Tests",
        "PCR Tests",
        "ATK Tests (NHSO provided)",
    ]
    # Each series as a percentage of the maximum of its own 7-row rolling mean.
    peaks = df[cols] / df.rolling(7).mean().max(axis=0) * 100
    plot_area(df=peaks,
              title='Tests as % of Peak - Thailand',
              png_prefix='tests_peak', cols_subset=cols, legends=legends,
              ma_days=7,
              kind='line', stacked=False, percent_fig=False, clean_end=True,
              cmap='tab10',
              y_formatter=perc_format,
              footnote='ATK: Covid-19 Rapid Antigen Self Test Kit\n'
              'Proactive: Testing done at high risk locations, rather than random sampling.',
              footnote_left='Data Source: MOPH Covid-19 Dashboard, CCSA Daily Briefing')

    ###############
    # Positive Rate
    ###############
    df["Positivity PUI"] = df["Cases"].divide(df["Tested PUI"]) * 100
    df["Positivity Public"] = df["Pos Public"] / df["Tests Public"] * 100

    # there is some weird spikes in tests and pos that throw out this measure. seems like they dumped extra data on certain
    # days. can avg it to try and remove it but better to just remove the outliers
    # roll = df["Tests XLS"].rolling(7)
    # devs = (df["Tests XLS"] - roll.mean()) / roll.std()
    # tests_cleaned = df["Tests XLS"][devs < 1.9]

    # Fix spikes in cases that didn't use to be there
    # NOTE(review): `df.loc[:, 'Cases']` is not an explicit copy, so depending
    # on the pandas version / copy-on-write mode the NaN assignments below may
    # also blank df['Cases'] for these dates - confirm which is intended.
    cleaned_cases = df.loc[:, 'Cases']
    cleaned_cases.loc["2022-10-02":"2022-10-08"] = np.nan
    cleaned_cases.loc["2022-10-30":"2022-11-05"] = np.nan

    df["Positivity Cases/Tests"] = (cleaned_cases / df["Tests XLS"]) * 100
    df["Positivity Public+Private"] = (df["Pos XLS"] / df["Tests XLS"] * 100)
    df['Positivity Walkins/PUI3'] = df['Cases Walkin'].divide(df['Tested PUI']) / 3.0 * 100
    df['Positive Rate Private'] = (df['Pos Private'] / df['Tests Private']) * 100
    df['Cases per PUI3'] = df['Cases'].divide(df['Tested PUI']) / 3.0 * 100
    df['Cases per Tests'] = df['Cases'] / df['Tests XLS'] * 100
    df['Positive Rate ATK Proactive'] = df['Pos ATK Proactive'] / df['Tests ATK Proactive'] * 100
    df['Positive Rate ATK'] = df['Pos ATK'] / df['Tests ATK'] * 100
    df['Positive Rate PCR + ATK'] = (df['Pos XLS'] + df['Pos ATK']) / (df['Tests XLS'] + df['Tests ATK']) * 100
    df['Positive Rate Dash %'] = df['Positive Rate Dash'] * 100

    ihme = ihme_dataset(check=False)
    df['infection_detection'] = ihme['infection_detection'] * 100

    cols = [
        'Positivity Public+Private',
        'Positive Rate ATK',
        'Positive Rate PCR + ATK',
        'Positivity Cases/Tests',
        # 'Cases per PUI3',
        # 'Positivity Walkins/PUI3',
        'Positive Rate ATK Proactive',
        'Positive Rate Dash %',
        'infection_detection',
    ]
    legends = [
        'Positive Results per PCR Test (Positive Rate)',
        'Positive Results per ATK Test (Positive Rate)',
        'Positive Results per Test (PCR + ATK)',
        'Confirmed Cases per PCR Test',
        # 'Confirmed Cases per PUI*3',
        # 'Walkin Cases per PUI*3',
        'Positive Results per ATK Test (NHSO provided)',
        'Positive Rate from DDC Dashboard',
        'Estimated Cases per Infection (IHME detection rate)',
    ]
    plot_area(df=df,
              title='Positive Rate - Thailand',
              legends=legends,
              highlight=['Positivity Public+Private'],
              png_prefix='positivity', cols_subset=cols,
              ma_days=7,
              kind='line', stacked=False, percent_fig=False,
              cmap='tab10',
              y_formatter=perc_format,
              footnote='While PCR test data is missing, Cases per Test might be a better estimate of Positive Rate\n'
              'WHO recommends < 5% *assuming tests are > 7k per day over 2 weeks\n'
              'NHSO provided ATK go to "high risk" areas so should show higher than normal positive rate',
              footnote_left=f'\n{source}Data Sources: DMSC Test Reports, DDC Dashboard, IHME')

    df['PUI per Case'] = df['Tested PUI'].divide(df['Cases'])
    df['PUI3 per Case'] = df['Tested PUI'] * 3 / df['Cases']
    df['PUI3 per Walkin'] = df['Tested PUI'] * 3 / df['Cases Walkin']
    df['PUI per Walkin'] = df['Tested PUI'].divide(df['Cases Walkin'])
    df['Tests per case'] = df['Tests XLS'] / df['Cases']
    df['Tests per positive'] = df['Tests XLS'] / df['Pos XLS']

    cols = [
        'Tests per positive',
        'Tests per case',
        'PUI per Case',
        'PUI3 per Case',
        'PUI per Walkin',
    ]
    legends = [
        'PCR Tests per Positive Result',
        'PCR Tests per Case',
        'PUI per Case',
        'PUI*3 per Case',
        'PUI per Walk-in Case',
    ]
    # # Gets too big. takes forever
    # plot_area(df=df,
    #           title='Tests per Confirmed Covid Cases - Thailand',
    #           legends=legends,
    #           png_prefix='tests_per_case', cols_subset=cols,
    #           ma_days=7,
    #           kind='line', stacked=False, percent_fig=False,
    #           cmap='tab10',
    #           footnote='\nPUI: Person Under Investigation\n'
    #           'PCR: Polymerase Chain Reaction\n'
    #           'Note: Walkin Cases/3xPUI seems to give an estimate of positive rate (when cases are high),\n'
    #           'so it is included for when testing data is delayed. It is not the actual positive rate.',
    #           footnote_left=f'\n{source}Data Sources: Daily Situation Reports\n DMSC: Thailand Laboratory Testing Data')

    cols = [
        'Positivity Cases/Tests',
        'Positivity Public',
        'Positivity PUI',
        'Positive Rate Private',
        'Positivity Public+Private',
    ]
    # Labels must line up one-to-one with `cols`: cases over all PCR tests,
    # public hospital positives, cases per PUI, private hospital positives,
    # and combined public+private positives.
    legends = [
        'Confirmed Cases per PCR Test',
        'Positive Results per PCR Test (Public Hospital)',
        'Confirmed Cases per PUI',
        'Positive Results per PCR Test (Private Hospital)',
        'Positive Results per PCR Test',
    ]
    plot_area(df=df,
              title='Positive Rate - Thailand',
              legends=legends,
              png_prefix='positivity_all', cols_subset=cols,
              ma_days=7,
              kind='line', stacked=False, percent_fig=False,
              cmap='tab10',
              footnote='\nPUI: Person Under Investigation\n'
              + 'Positivity Rate: The percentage of COVID-19 tests that come back positive.',
              footnote_left=f'\n{source}Data Sources: Daily Situation Reports\n DMSC: Thailand Laboratory Testing Data')

    ########################
    # Public vs Private
    ########################
    df['Tests Private Ratio'] = (df['Tests Private'] / df['Tests Public']).rolling('7d').mean()
    df['Tests Positive Private Ratio'] = (df['Pos Private'] / df['Pos Public']).rolling('7d').mean()
    df['Positive Rate Private Ratio'] = (df['Pos Private'] / (df['Tests Private'])
                                         / (df['Pos Public'] / df['Tests Public'])).rolling('7d').mean()
    df['PUI Private Ratio'] = (df['Tested PUI Walkin Private'] / df['Tested PUI Walkin Public']).rolling('7d').mean()
    cols = [
        'Tests Private Ratio',
        'Tests Positive Private Ratio',
        'PUI Private Ratio',
        'Positive Rate Private Ratio',
    ]
    plot_area(df=df,
              title='Testing Private Ratio - Thailand',
              png_prefix='tests_private_ratio', cols_subset=cols,
              ma_days=7,
              kind='line', stacked=False, percent_fig=False,
              cmap='tab10',
              footnote='\nPUI: Person Under Investigation\n'
              + 'Positivity Rate: The percentage of COVID-19 tests that come back positive.',
              footnote_left=f'\n{source}Data Sources: Daily Situation Reports\n DMSC: Thailand Laboratory Testing Data')

    ##################
    # Test Plots
    ##################
    # Missing values on either side are treated as 0 in the subtraction.
    df["Cases outside Prison"] = df["Cases Local Transmission"].sub(df["Cases Area Prison"], fill_value=0)

    cols = [
        'Cases',
        'Cases Walkin',
        'Pos XLS',
        'Pos ATK',
        # 'ATK+',
        # 'Pos Public',
        'ATK',
        'Pos ATK Proactive',
    ]
    legends = [
        'Confirmed Cases',
        'Confirmed Walk-in Cases',
        'Positive PCR Test Results',
        'Positive ATK Test Results (DMSC)',
        # 'ATK+ (DDC Dash)',
        # 'Positive PCR Test Results (Public)',
        'Registered ATK Probable Case (Home Isolation)',
        'Positive Proactive ATK Test Results (NHSO provided)',
    ]
    plot_area(df=df,
              title='Positive Test Results vs. Confirmed Covid Cases - Thailand',
              legends=legends,
              png_prefix='cases', cols_subset=cols,
              # actuals=["Cases", "Pos XLS"],
              ma_days=7,
              kind='line', stacked=False, percent_fig=False,
              cmap="tab10",
              footnote='ATK: Covid-19 Rapid Antigen Self Test Kit\n'
              'Cases higher than PCR positive tests likely due to missing PCR test data',
              footnote_left=f'{source}Data Sources: Daily Situation Reports\n DMSC: Thailand Laboratory Testing Data')

    cols = [
        'Cases',
        'Cases outside Prison',
        'Cases Walkin',
        'Pos XLS',
    ]
    legends = [
        'Confirmed Cases',
        'Confirmed Cases excl. Prison Cases',
        'Confirmed Cases excl. Proactive Cases',
        'Positive PCR Test Results',
    ]
    plot_area(df=df,
              title='Covid Cases vs. Positive Tests - Thailand',
              legends=legends,
              png_prefix='cases_tests', cols_subset=cols,
              ma_days=21,
              kind='line', stacked=False, percent_fig=False,
              cmap="tab10",
              footnote='Proactive: Testing done at high risk locations, rather than random sampling.\n'
              'Cases higher than PCR positive tests likely due to missing PCR test data',
              footnote_left=f'{source}Data Sources: Daily Situation Reports\n DMSC: Thailand Laboratory Testing Data')

    # Cumulative counts restarted at 2021-04-01 (rows before that stay NaN).
    df['Cases 3rd Cum'] = df['2021-04-01':]['Cases'].cumsum()
    df['Cases outside Prison 3rd Cum'] = df['2021-04-01':]['Cases outside Prison'].cumsum()
    df['Cases Walkin 3rd Cum'] = df['2021-04-01':]['Cases Walkin'].cumsum()
    df['Pos XLS 3rd Cum'] = df['2021-04-01':]['Pos XLS'].cumsum()
    cols = [
        'Cases 3rd Cum',
        'Cases outside Prison 3rd Cum',
        'Cases Walkin 3rd Cum',
        'Pos XLS 3rd Cum',
    ]
    legends = [
        'Confirmed Cases',
        'Confirmed Cases excl. Prison Cases',
        'Confirmed Cases excl. Proactive Cases',
        'Positive PCR Test Results',
    ]
    plot_area(df=df,
              title='3rd Wave Cumulative Covid Cases and Positive Tests - Thailand',
              legends=legends,
              png_prefix='cases_tests_cum3', cols_subset=cols,
              ma_days=None,
              kind='line', stacked=False, percent_fig=False,
              cmap="tab10",
              footnote='Proactive: Testing done at high risk locations, rather than random sampling.',
              footnote_left=f'{source}Data Sources: Daily Situation Reports\n DMSC: Thailand Laboratory Testing Data')

    cols = [
        'Cases',
        'Pos Area',
        'Pos XLS',
        'Pos Public',
        'Pos Private',
        'Pos',
    ]
    legends = [
        'Cases',
        'Positive PCR Test Results (Health Districts Combined)',
        'Positive PCR Test Results',
        'Positive PCR Test Results (Public Hospitals)',
        'Positive PCR Test Results (Private Hospitals)',
        'Positive Test Results',
    ]
    plot_area(df=df,
              title='Positive Test Results vs. Confirmed Covid Cases - Thailand',
              legends=legends,
              png_prefix='cases_all', cols_subset=cols,
              ma_days=7,
              kind='line', stacked=False, percent_fig=False,
              cmap='tab20',
              footnote_left=f'{source}Data Sources: Daily Situation Reports\n DMSC: Thailand Laboratory Testing Data')
##########################
# Tests by area
##########################
def save_test_area_plots(df):
    """Plot PCR testing, positive-result and positivity charts by area.

    Charts are produced per health district, per region and per province by
    calling ``plot_area`` once per chart. Data comes from three datasets read
    with ``import_csv`` ("tests_by_area", "moph_dashboard_prov" and
    "tests_by_province") plus the caller-supplied ``df``.

    Args:
        df: Frame holding the combined daily data. The columns read here are
            'Tests XLS', 'Pos XLS', and per district 'Cases Area N',
            'Tests Area N', 'Pos Area N' and 'Positive Rate Dash Area N'.
            NOTE: ``df`` is modified in place -- the columns 'Tests Daily N',
            'Pos Daily N', 'Positivity N', 'Total Positivity Area',
            'Cases/Tests N' and 'Case-Pos N' are added (or overwritten).

    Returns:
        None. The only outputs are the side effects of ``plot_area`` and the
        columns added to ``df``.
    """
    plt.rc('legend', fontsize=12)

    # Footnote texts that are shared by several of the charts below.
    dmsc_source = f'{source}Data Source: DMSC: Thailand Laboratory Testing Data'
    dash_source = f'{source}Data Source: MOPH Covid-19 Dashboard'
    tests_note = ('Note: Excludes some proactive and private tests (non-PCR) so actual tests is higher.\n'
                  'Proactive: Testing done at high risk locations, rather than random sampling.\n'
                  'PCR: Polymerase Chain Reaction')
    positivity_note = ('PCR: Polymerase Chain Reaction\n'
                       'Positivity Rate: The percentage of COVID-19 tests that come back positive.\n'
                       'Note: Excludes some proactive and private tests (non-PCR) so actual tests is higher.\n'
                       'Proactive: Testing done at high risk locations, rather than random sampling.')

    # by_area = import_csv("tests_by_area", index=["Start"], date_cols=["Start", "End"]).drop(columns=["End"])
    # # Works up until 2021-04-11. before this dates are offset?
    # by_area = by_area.reindex(pd.date_range(by_area.index.min(), by_area.index.max(), freq='W'))
    # # .interpolate(limit_area="inside")
    by_area = import_csv("tests_by_area", index=["End"], date_cols=["Start", "End"])
    # Turn each reporting period's totals into a per-day mean by dividing by
    # the number of days between the period's Start and its End (the index).
    period_days = (by_area.index - by_area["Start"]).dt.days
    by_area_d = by_area.drop(columns=["Start"]).div(period_days, axis=0)
    # TODO: since it's daily mean, should move to the center of the week?
    # Spread the period rows over a daily index, filling only interior gaps.
    daily_index = pd.date_range(by_area_d.index.min(), by_area_d.index.max(), freq='D')
    by_area_d = by_area_d.reindex(daily_index).interpolate(limit_area="inside")

    # ---- Period means straight from the by-area report ----
    cols = rearrange([f'Tests Area {area}' for area in DISTRICT_RANGE_SIMPLE], *FIRST_AREAS)
    plot_area(df=by_area_d,
              title='PCR Tests by Health District - Thailand',
              legends=AREA_LEGEND_SIMPLE,
              png_prefix='tests_area', cols_subset=cols,
              ma_days=None,
              kind='area', stacked=True, percent_fig=False,
              cmap='tab20',
              footnote=tests_note,
              footnote_left=dmsc_source)

    cols = rearrange([f'Pos Area {area}' for area in DISTRICT_RANGE_SIMPLE], *FIRST_AREAS)
    plot_area(df=by_area_d,
              title='PCR Positive Test Results by Health District - Thailand',
              legends=AREA_LEGEND_SIMPLE,
              png_prefix='pos_area', cols_subset=cols,
              ma_days=None,
              kind='area', stacked=True, percent_fig=False,
              cmap='tab20',
              footnote=tests_note,
              footnote_left=dmsc_source)

    # ---- Daily national tests split by each district's share ----
    # for area in DISTRICT_RANGE_SIMPLE:
    #     df[f'Tests Area {area} (i)'] = df[f'Tests Area {area}'].interpolate(limit_area="inside")
    test_cols = [f'Tests Area {area}' for area in DISTRICT_RANGE_SIMPLE]
    # The all-district total is the same for every district: compute it once.
    tests_total = by_area_d[test_cols].sum(axis=1)
    for area in DISTRICT_RANGE_SIMPLE:
        df[f'Tests Daily {area}'] = by_area_d[f'Tests Area {area}'] / tests_total * df['Tests XLS']
    cols = rearrange([f'Tests Daily {area}' for area in DISTRICT_RANGE_SIMPLE], *FIRST_AREAS)
    plot_area(df=df,
              title='PCR Tests by Health District - Thailand',
              legends=AREA_LEGEND_SIMPLE,
              png_prefix='tests_area_daily', cols_subset=cols,
              ma_days=7,
              kind='line', stacked=False, percent_fig=False,
              cmap='tab20',
              footnote=tests_note,
              footnote_left=dmsc_source)

    # ---- Daily national positives split by each district's share ----
    # for area in DISTRICT_RANGE_SIMPLE:
    #     df[f'Pos Area {area} (i)'] = df[f'Pos Area {area}'].interpolate(limit_area="inside")
    pos_cols = [f'Pos Area {area}' for area in DISTRICT_RANGE_SIMPLE]
    pos_total = by_area_d[pos_cols].sum(axis=1)
    for area in DISTRICT_RANGE_SIMPLE:
        df[f'Pos Daily {area}'] = by_area_d[f'Pos Area {area}'] / pos_total * df['Pos XLS']
    cols = rearrange([f'Pos Daily {area}' for area in DISTRICT_RANGE_SIMPLE], *FIRST_AREAS)
    plot_area(df=df,
              title='Positive PCR Tests by Health District - Thailand',
              legends=AREA_LEGEND_SIMPLE,
              png_prefix='pos_area_daily', cols_subset=cols,
              ma_days=7,
              kind='area', stacked=True, percent_fig=False,
              cmap='tab20',
              footnote=tests_note,
              footnote_left=dmsc_source)

    # ---- Positivity per district ----
    # Workout positivity for each area as proportion of positivity for that period
    for area in DISTRICT_RANGE_SIMPLE:
        df[f'Positivity {area}'] = by_area_d[f'Pos Area {area}'] / by_area_d[f'Tests Area {area}'] * 100
    cols = [f'Positivity {area}' for area in DISTRICT_RANGE_SIMPLE]
    plot_area(df=df,
              title='Positive Rate by Health District - Thailand',
              legends=AREA_LEGEND_SIMPLE,
              png_prefix='positivity_area', cols_subset=rearrange(cols, *FIRST_AREAS),
              ma_days=7,
              kind='line', stacked=True, percent_fig=False,
              cmap='tab20',
              y_formatter=perc_format,
              footnote=positivity_note,
              footnote_left=dmsc_source)

    # Rescale the district positivities so that they sum to the overall
    # positivity computed from 'Pos XLS' / 'Tests XLS'. This overwrites the
    # 'Positivity N' columns that were plotted just above.
    df['Total Positivity Area'] = df[cols].sum(axis=1)
    overall_positivity = df["Pos XLS"] / df["Tests XLS"] * 100
    for area in DISTRICT_RANGE_SIMPLE:
        df[f'Positivity {area}'] = df[f'Positivity {area}'] / df['Total Positivity Area'] * overall_positivity
    plot_area(df=df,
              title='Positive Rate by Health District - Thailand',
              legends=AREA_LEGEND_SIMPLE,
              png_prefix='positivity_area_stacked', cols_subset=rearrange(cols, *FIRST_AREAS),
              ma_days=7,
              kind='area', stacked=True, percent_fig=False,
              cmap='tab20',
              y_formatter=perc_format,
              footnote=positivity_note,
              footnote_left=dmsc_source)

    # ---- Dashboard positive rate: top health districts ----
    dash_prov = import_csv("moph_dashboard_prov", ["Date", "Province"], dir="inputs/json")
    # TODO: 0 maybe because no test data on that day? Does median make sense?
    dash_prov["Positive Rate Dash"] = dash_prov["Positive Rate Dash"].replace({0.0: np.nan})

    # for area in DISTRICT_RANGE_SIMPLE:
    #     df[f'Positivity Daily {area}'] = df[f'Pos Daily {area}'] / df[f'Tests Daily {area}'] * 100
    # cols = [f'Positivity Daily {area}' for area in DISTRICT_RANGE_SIMPLE]
    pos_areas = join_provinces(dash_prov, "Province", ["Health District Number", "region"])
    pos_areas = area_crosstab(pos_areas, "Positive Rate Dash", aggfunc="mean") * 100
    cols = rearrange([f'Positive Rate Dash Area {area}' for area in DISTRICT_RANGE_SIMPLE], *FIRST_AREAS)
    # Rank the districts by the last row that has any data and keep the top 5.
    # NOTE(review): the ranking is read from `df` although the chart below
    # plots `pos_areas` -- confirm that `df` really is the intended source.
    area_rates = df[cols]
    topcols = area_rates.sort_values(by=area_rates.last_valid_index(), axis=1, ascending=False).columns[:5]
    legends = rearrange(AREA_LEGEND_ORDERED, *[cols.index(c) + 1 for c in topcols])[:5]
    plot_area(df=pos_areas,
              title='Average Positive Rate - by Health District - Thailand',
              legends=legends,
              png_prefix='positivity_area_unstacked', cols_subset=topcols,
              ma_days=7,
              kind='line', stacked=False, percent_fig=False,
              cmap='tab10',
              y_formatter=perc_format,
              footnote='Positivity Rate: The % of COVID-19 tests that come back positive.',
              footnote_left=dash_source)

    # ---- Positive rate per region ----
    # Mean dashboard positive rate per date and region, as a percentage.
    pos_areas = join_provinces(dash_prov, "Province", ["Health District Number", "region"]).reset_index()
    pos_areas = pd.crosstab(pos_areas['Date'], pos_areas['region'],
                            values=pos_areas["Positive Rate Dash"], aggfunc="mean") * 100
    # The same table built from the by-province testing data; it only fills
    # the cells the dashboard table lacks (see combine_first below).
    tests_by_province = import_csv("tests_by_province", index=["Date", "Province"])
    prov_pos_cols = [c for c in tests_by_province.columns if ' Pos' in c]
    prov_test_cols = [c for c in tests_by_province.columns if ' Tests' in c]
    pos_prov = tests_by_province[prov_pos_cols].sum(axis=1) / tests_by_province[prov_test_cols].sum(axis=1)
    pos_prov = pos_prov.to_frame("Positive Rate")
    pos_prov = join_provinces(pos_prov, "Province", ["Health District Number", "region"]).reset_index()
    pos_prov = pd.crosstab(pos_prov['Date'], pos_prov['region'],
                           values=pos_prov["Positive Rate"], aggfunc="mean") * 100
    plot_area(df=pos_areas.combine_first(pos_prov),
              title='PCR Positive Rate - Mean per Region - Thailand',
              png_prefix='positivity_region', cols_subset=utils_thai.REG_COLS, legends=utils_thai.REG_LEG,
              ma_days=21,
              kind='line', stacked=False, percent_fig=False, mini_map=True,
              cmap=utils_thai.REG_COLOURS,
              y_formatter=perc_format,
              # TODO: fix table when incomplete data
              # table=trend_table(dash_prov["Positive Rate Dash"].dropna() * 100, sensitivity=4, style="green_down", ma_days=21),
              footnote='Positivity Rate: The % of COVID-19 tests that come back positive.\nDashboard positive rate differs from testing reports',
              footnote_left=dash_source)

    # ---- Positive rate: highest and lowest provinces ----
    top5 = dash_prov.pipe(topprov,
                          lambda df: df["Positive Rate Dash"] * 100,
                          name="Province Positive Rate",
                          other_name=None,
                          num=5)
    cols = top5.columns.to_list()
    plot_area(df=top5,
              title='Positive Rate - Top Provinces - Thailand',
              png_prefix='positivity_prov_top', cols_subset=cols,
              ma_days=14,
              kind='line', stacked=False, percent_fig=False,
              cmap='tab10',
              y_formatter=perc_format,
              footnote='Positivity Rate: The percentage of COVID-19 tests that come back positive.',
              footnote_left=dash_source)

    # Presumably the first callable ranks the provinces (negated here to pick
    # the lowest) and the second supplies the plotted values -- verify against
    # topprov.
    top5 = dash_prov.pipe(topprov,
                          lambda df: -df["Positive Rate Dash"] * 100,
                          lambda df: df["Positive Rate Dash"] * 100,
                          name="Province Positive Rate",
                          other_name=None,
                          num=5)
    cols = top5.columns.to_list()
    plot_area(df=top5,
              title='Positive Rate - Lowest Provinces - Thailand',
              png_prefix='positivity_prov_low', cols_subset=cols,
              ma_days=14,
              kind='line', stacked=False, percent_fig=False,
              cmap='tab10',
              y_formatter=perc_format,
              footnote='Positivity Rate: The percentage of COVID-19 tests that come back positive.',
              footnote_left=dash_source)

    # ---- Cases relative to tests, per district ----
    for area in DISTRICT_RANGE_SIMPLE:
        df[f'Cases/Tests {area}'] = df[f'Cases Area {area}'] / df[f'Tests Area {area}'] * 100
    cols = [f'Cases/Tests {area}' for area in DISTRICT_RANGE_SIMPLE]
    plot_area(df=df,
              title='Highest Covid Cases/Tests by Health District - Thailand',
              legends=AREA_LEGEND_SIMPLE,
              png_prefix='casestests_area_unstacked', cols_subset=rearrange(cols, *FIRST_AREAS),
              ma_days=None,
              kind='area', stacked=False, percent_fig=False,
              cmap='tab20',
              footnote=tests_note,
              footnote_left=dmsc_source)

    # ---- Cases minus positive results, per district ----
    for area in DISTRICT_RANGE_SIMPLE:
        df[f'Case-Pos {area}'] = df[f'Cases Area {area}'] - df[f'Pos Area {area}']
    cols = [f'Case-Pos {area}' for area in DISTRICT_RANGE_SIMPLE]
    plot_area(df=df,
              title='Which Health Districts have more Covid Cases than Positive Results? - Thailand',
              legends=AREA_LEGEND_SIMPLE,
              png_prefix='cases_from_positives_area', cols_subset=rearrange(cols, *FIRST_AREAS),
              ma_days=None,
              kind='area', stacked=False, percent_fig=False, limit_to_zero=False,
              cmap='tab20',
              footnote_left=f'{source}Data Source: CCSA Daily Briefing')
if __name__ == "__main__":
    # Script entry point: read the combined dataset, set the environment
    # flags, then run each group of plots against the same frame.
    # The flags are deliberately set only after the CSV has been read, which
    # matches the original statement order.
    df = import_csv("combined", index=["Date"])
    os.environ.update({"MAX_DAYS": '0', 'USE_CACHE_DATA': 'True'})
    for make_plots in (save_variant_plots, save_test_area_plots, save_tests_plots):
        make_plots(df)