4_main_analysis_GW.Rmd

---
title: "Analysis of confidence intervals in journal abstracts and full-text."
author: "Adrian Barnett"
date: "`r format(Sys.time(), '%d %B %Y')`"
output: word_document
---

```{r setup, include=FALSE}
# knitr defaults: hide code, warnings and messages; high-resolution figures
knitr::opts_chunk$set(
  echo = FALSE, warning = FALSE, message = FALSE, error = FALSE,
  comment = '', dpi = 400, fig.width = 7, fig.height = 5
)
# wide pages and no scientific notation in printed numbers
options(width = 1000, scipen = 999)

# libraries
library(extrafont) # for PLOS fonts (Times New Roman)
loadfonts(device = 'postscript')
library(stringr)
library(dplyr)
library(pander)
library(tables)
library(broom)
library(ggplot2)

# formatting of pander tables
panderOptions('table.emphasize.rownames', FALSE)
panderOptions('keep.trailing.zeros', TRUE)
panderOptions('table.split.table', Inf)
panderOptions('table.split.cells', Inf)
panderOptions('big.mark', ',')

# shared plot theme and colour-blind friendly palette
g.theme <- theme_bw() + theme(panel.grid.minor = element_blank())
cbPalette <- c("#999999", "#E69F00", "#56B4E9", "#009E73",
               "#F0E442", "#0072B2", "#D55E00", "#CC79A7")

# round to a fixed number of digits and return text that keeps trailing zeros
roundz <- function(x, digits = 0) {
  formatC(round(x, digits), format = 'f', digits = digits)
}

# get the data (creates the data frame `complete`)
load('data/Georgescu.Wren.RData') # from 1_data_from_Georgescu_Wren.R
# remove small number with missing lower limit
complete <- complete %>%
  filter(!is.na(lower))
```

# Summary table

The table below shows the number of confidence limits from abstracts and full-texts, together with the total number of unique journals, and the number with a zero lower limit (which are excluded from the plots below).

```{r summary}
# one row per source: number of limits, number of unique journals and
# number of lower limits that are exactly zero
source.summary <- complete %>%
  group_by(source) %>%
  summarise(
    n = n(),
    journals = length(unique(journal)),
    zero = sum(lower == 0)
  )
pander(source.summary, style = 'simple')
```

## Table of mistakes

The table below shows how many intervals were flagged as a mistake because the mean was not within the interval.

```{r table.mistake}
# rows: mistake flag plus an overall row; columns: count and column percent
# for each source
mistake.tab <- tabular(
  Heading('Mistake') * factor(mistake) + 1 ~
    Heading('') * factor(source) * ((n = 1) + Percent('col')),
  data = complete
)
pander(mistake.tab, digits = 1)
```

## Table of CI level

### Numbers with missing CI level

```{r table.level.missing}
# keep intervals without a mistake and with a positive lower limit, then flag
# those where the CI level is missing
level.missing <- complete %>%
  filter(mistake == FALSE, lower > 0) %>%
  mutate(is.95.missing = is.na(ci.level))
# rows: missing flag plus an overall row; columns: each source plus overall
missing.tab <- tabular(
  Heading('Missing') * factor(is.95.missing) + 1 ~
    Heading('') * (factor(source) + 1) * ((n = 1) + Percent('col')),
  data = level.missing
)
pander(missing.tab, digits = 1)
```
### Numbers with a 95% CI level

For the table below we excluded the intervals where the CI level was not given.

```{r table.level}
# keep intervals without a mistake, with a positive lower limit and a known
# CI level, then flag the 95% intervals
# NOTE(review): exact comparison with 0.95 assumes ci.level is stored as a
# proportion with no rounding error - confirm against the data preparation
level.known <- complete %>%
  filter(mistake == FALSE, lower > 0, !is.na(ci.level)) %>%
  mutate(is.95 = ci.level == 0.95)
# rows: 95% flag plus an overall row; columns: each source plus overall
level.tab <- tabular(
  Heading('95% CI') * factor(is.95) + 1 ~
    Heading('') * (factor(source) + 1) * ((n = 1) + Percent('col')),
  data = level.known
)
pander(level.tab, digits = 1)
```


# Lower confidence interval (log-scale)

```{r gw.lower}
# lower limits that can be shown on a log scale (zero limits are dropped)
rr.lower <- complete %>%
  filter(lower > 0)
# empirical cumulative distribution of the lower limit, one panel per source
lplot <- ggplot(rr.lower, aes(x = lower)) +
  geom_vline(xintercept = 1, linetype = 1, colour = 'black') +
  stat_ecdf(geom = "step", size = 1.1) +
  scale_x_log10(breaks = c(0.1, 0.5, 1, 2, 10), labels = c(0.1, 0.5, 1, 2, 10)) +
  coord_cartesian(xlim = c(0.1, 10)) + # limit x-axis to focus on key change
  labs(x = 'Ratio', y = 'Cumulative distribution') +
  g.theme +
  facet_wrap(~source)
lplot
# percent of limits that fall outside the x-axis limits of the plot
n.lower <- nrow(rr.lower)
perc.lower.ex <- round(100 * sum(rr.lower$lower < 0.1) / n.lower, 1)
perc.upper.ex <- round(100 * sum(rr.lower$lower > 10) / n.lower, 1)
# export plot
jpeg('figures/lowerIntervalGW.jpg', width = 5, height = 4, units = 'in',
     res = 400, quality = 100)
print(lplot)
invisible(dev.off())
```

Of the lower limits, `r perc.lower.ex`% were excluded from the plot because they were less than 0.1, and `r perc.upper.ex`% were excluded because they were above 10.

## Number and percent of lower intervals above 1 and less than or equal to 1.1

```{r tab.one}
# flag positive lower limits that are just above the null, i.e. in (1, 1.1]
narrow.data <- complete %>%
  group_by(source) %>%
  filter(lower > 0) %>%
  mutate(
    narrow = lower > 1 & lower <= 1.1,
    narrowf = factor(narrow, levels = c(FALSE, TRUE), labels = c('No', 'Yes'))
  )
# rows: No / Yes plus an overall row; columns: count and column percent by source
narrow.tab <- tabular(
  Heading('(1, 1.1]') * narrowf + 1 ~
    Heading('') * factor(source) * ((n = 1) + Percent('col')),
  data = narrow.data
)
pander(narrow.tab, digits = 0)
```

## Number and percent of lower intervals above 1 and less than or equal to 1.2

```{r tab.one.one}
# flag positive lower limits that are just above the null, i.e. in (1, 1.2]
narrow.data <- complete %>%
  group_by(source) %>%
  filter(lower > 0) %>%
  mutate(
    narrow = lower > 1 & lower <= 1.2,
    narrowf = factor(narrow, levels = c(FALSE, TRUE), labels = c('No', 'Yes'))
  )
# rows: No / Yes plus an overall row; columns: count and column percent by source
narrow.tab <- tabular(
  Heading('(1, 1.2]') * narrowf + 1 ~
    Heading('') * factor(source) * ((n = 1) + Percent('col')),
  data = narrow.data
)
pander(narrow.tab, digits = 0)
```

# Upper confidence interval (log scale)

```{r gw.upper}
# empirical cumulative distribution of the upper limit, one panel per source
uplot <- ggplot(complete, aes(x = upper)) +
  geom_vline(xintercept = 1, linetype = 1, colour = 'black') +
  stat_ecdf(geom = "step", size = 1.1) +
  scale_x_log10(breaks = c(0.1, 0.5, 1, 2, 10), labels = c(0.1, 0.5, 1, 2, 10)) +
  # limit both axes to focus on key change
  coord_cartesian(xlim = c(0.5, 2), ylim = c(0, 0.6)) +
  labs(x = 'Ratio', y = 'Cumulative distribution') +
  g.theme +
  facet_wrap(~source)
uplot
```

# Plot of lower and upper intervals side by side (log scale)

```{r plot.both.gw}
# number with missing year per source, used in the text further below
n.no.year <- complete %>%
  group_by(source) %>%
  summarise(count.na = sum(is.na(Year)))
# lower limits in long format: exclude mistakes and non-positive limits
# NOTE(review): `link` is numbered separately in the lower and upper sets and
# zero lower limits are dropped only from the lower set, so the same `link`
# value need not identify the same interval in both sets
lower.limits <- complete %>%
  filter(lower > 0, mistake == FALSE) %>%
  mutate(link = 1:n(), itype = 'Lower') %>%
  select(link, Year, itype, interval = lower, source)
# upper limits in long format: exclude mistakes
upper.limits <- complete %>%
  filter(mistake == FALSE) %>%
  mutate(link = 1:n(), itype = 'Upper') %>%
  select(link, Year, itype, interval = upper, source)
# stack the two sets; this data frame is used by the plots that follow
both.intervals <- bind_rows(lower.limits, upper.limits)
# cumulative distributions in a grid: interval type (rows) by source (columns)
bplot <- ggplot(both.intervals, aes(x = interval)) +
  geom_vline(xintercept = 1, linetype = 1, colour = 'black') +
  stat_ecdf(geom = "step", size = 1.1) +
  scale_x_log10(breaks = c(0.1, 0.5, 1, 2, 10), labels = c(0.1, 0.5, 1, 2, 10)) +
  coord_cartesian(xlim = c(0.1, 10)) + # limit x-axis to focus on key change
  labs(x = 'Ratio', y = 'Cumulative distribution') +
  g.theme +
  facet_grid(itype ~ source)
bplot
# export plot
jpeg('figures/BothIntervalGW.jpg', width = 5, height = 5, units = 'in',
     res = 400, quality = 100)
print(bplot)
invisible(dev.off())
```

# Alternative plot on same panel (log scale)

```{r plot.both.gw.same.panel}
# lower and upper limits on the same panel, distinguished by colour;
# one panel per source
bplot <- ggplot(both.intervals, aes(x = interval, colour = factor(itype))) +
  geom_vline(xintercept = 1, linetype = 1, colour = 'black') +
  stat_ecdf(geom = "step", size = 1.1) +
  scale_x_log10(breaks = c(0.1, 0.5, 1, 2, 10), labels = c(0.1, 0.5, 1, 2, 10)) +
  coord_cartesian(xlim = c(0.1, 10)) + # limit x-axis to focus on key change
  scale_color_manual('Interval', values = cbPalette[2:3]) +
  labs(x = 'Ratio', y = 'Cumulative distribution') +
  facet_wrap(~source) +
  g.theme +
  theme(legend.position = c(0.1, 0.8))
bplot
# export plot
jpeg('figures/BothIntervalGWColour.jpg', width = 5.5, height = 4, units = 'in',
     res = 400, quality = 100)
print(bplot)
invisible(dev.off())
# version for journal appendix
postscript('figures/BarnettAppendixFig1.eps', width = 5.5, height = 4,
           family = 'Times New Roman')
print(bplot)
invisible(dev.off())
```

```{r observation.counts}
# count and percent of each interval type that falls inside the x-axis limits
# (0.1 to 10) of the plot above; used in the sentence that follows
stats <- both.intervals %>%
  mutate(inplot = interval >= 0.1 & interval <= 10) %>%
  group_by(itype) %>%
  summarise(
    count = sum(inplot),
    n = n(),
    percent = round(100 * count / n)
  )
```

The plot above includes `r stats$percent[stats$itype=='Lower']`% of all lower intervals and `r stats$percent[stats$itype=='Upper']`% of all upper intervals.

# Alternative plot on non-log scale

This plot was suggested by a reviewer who said that "log-scales are difficult to process intuitively".
The x-axis is restricted to 0.25 (1/4) to 4.

```{r plot.non.log}
# add numbers to facet labels
# number of intervals per source
numbers <- both.intervals %>%
  filter(itype == 'Lower') %>% # avoid double-counting
  group_by(source) %>%
  summarise(n = n())
# add numbers to labels (format() pads with spaces, which are removed)
n.text <- str_replace_all(pattern = ' ', replacement = '',
                          string = format(numbers$n, big.mark = ','))
facet.labels <- paste(numbers$source, ' (n = ', n.text, ')', sep = '')
# add label to data
both.intervals <- mutate(both.intervals,
                         facet = factor(source, levels = numbers$source,
                                        labels = facet.labels))
# plot on the untransformed (non-log) ratio scale
non.log.plot <- ggplot(both.intervals, aes(x = interval, col = factor(itype))) +
  geom_vline(xintercept = 1, lty = 1, col = 'black') +
  stat_ecdf(geom = "step", size = 1.1) +
  coord_cartesian(xlim = c(0.25, 4)) + # limit x-axis to focus on key change
  xlab('Ratio') +
  ylab('Cumulative distribution') +
  scale_color_manual('Interval', values = cbPalette[2:3]) +
  theme_bw() +
  facet_wrap(~facet) +
  theme(panel.grid.minor = element_blank(), legend.position = c(0.85, 0.2))
non.log.plot
# export plot
jpeg('figures/BothIntervalGWColourNonLog.jpg', width = 5.5, height = 4,
     units = 'in', res = 400, quality = 100)
print(non.log.plot)
invisible(dev.off())
# for journal
#postscript('figures/BarnettFig1.eps', width=5.5, height=4, family='Times New Roman')
#print(non.log.plot)
#invisible(dev.off())
# export plot as tiff (change font to Times)
# this must print the non-log plot made in this chunk; the log-scale `bplot`
# from the previous chunk is exported separately as the appendix figure
tiff('figures/BarnettFig1.tif', width = 5.5, height = 4, units = 'in',
     res = 600, compression = 'lzw', family = 'Times New Roman')
print(non.log.plot)
invisible(dev.off())
```


```{r}
# count and percent of each interval type that falls inside the x-axis limits
# (0.25 to 4) of the plot above; used in the sentence that follows
stats <- both.intervals %>%
  mutate(inplot = interval >= 0.25 & interval <= 4) %>%
  group_by(itype) %>%
  summarise(
    count = sum(inplot),
    n = n(),
    percent = round(100 * count / n)
  )
```

The plot above includes `r stats$percent[stats$itype=='Lower']`% of all lower intervals and `r stats$percent[stats$itype=='Upper']`% of all upper intervals.

# Alternative plot on zoomed in scale

This plot was suggested by a reviewer who asked if we could zoom in on the area closest to 1.
The x-axis is restricted to 0.83 to 1.2 and the y-axis is restricted to reduce white space.
We also plot this on a non-log scale.

```{r plot.zoome}
# zoomed-in cumulative distributions close to a ratio of 1 (non-log scale);
# lower and upper limits in colour, one panel per source
zoom.plot <- ggplot(both.intervals, aes(x = interval, colour = factor(itype))) +
  geom_vline(xintercept = 1, linetype = 1, colour = 'black') +
  stat_ecdf(geom = "step", size = 1.1) +
  # limit x-axis to focus on key change; limit y-axis to reduce white space
  coord_cartesian(xlim = c(0.833, 1 / 0.833), ylim = c(0.1, 0.7)) +
  scale_color_manual('Interval', values = cbPalette[2:3]) +
  labs(x = 'Ratio', y = 'Cumulative distribution') +
  facet_wrap(~source) +
  g.theme +
  theme(legend.position = c(0.10, 0.82))
zoom.plot
# export plot
jpeg('figures/BothIntervalGWColourNarrowX.jpg', width = 5.5, height = 4,
     units = 'in', res = 400, quality = 100)
print(zoom.plot)
invisible(dev.off())
# for journal appendix (just change of name)
postscript('figures/BarnettAppendixFig3.eps', width = 5.5, height = 4,
           family = 'Times New Roman')
print(zoom.plot)
invisible(dev.off())
```

This zoomed in plot shows the clear steps in the data due to rounding.

```{r observation.counts.zoom}
# count and percent of each interval type that falls between 0.83 and 1.2,
# the range quoted in the text for the zoomed plot; used in the sentence below
stats <- both.intervals %>%
  mutate(inplot = interval >= 0.83 & interval <= 1.2) %>%
  group_by(itype) %>%
  summarise(
    count = sum(inplot),
    n = n(),
    percent = round(100 * count / n)
  )
```

The plot above includes `r stats$percent[stats$itype=='Lower']`% of all lower intervals and `r stats$percent[stats$itype=='Upper']`% of all upper intervals.


# Non-cumulative plots

## Wider bin-width

This plot is in bin-widths of 0.1. The x-axis range is from 0.2 to 4.

```{r plot.both.bar.nonlog}
# create frequencies
bin.width <- 0.1
to.plot <- dplyr::filter(both.intervals,
                         interval > 0.2,
                         interval <= 4) %>%
  # groups are [ to ); the ratio is rounded before flooring because
  # floating-point division can land just below a whole number for a value
  # that sits exactly on a bin edge (e.g. floor(0.3/0.1) is 2, not 3), which
  # would put that value in the bin below
  mutate(groups = floor(round(interval / bin.width, 6))) %>%
  group_by(itype, source, groups) %>%
  summarise(height = n()) %>%
  mutate(x = (groups * bin.width) + (bin.width / 2)) # midpoint
# bar chart of counts per bin: interval type (rows) by source (columns)
dplot <- ggplot(data = to.plot, aes(x = x, y = height, fill = factor(itype))) +
  geom_vline(xintercept = 1, lty = 1, col = 'black') +
  geom_col() +
  facet_grid(itype ~ source, scales = 'free_y') +
  xlab('Ratio') +
  ylab('Counts') +
  scale_fill_manual('Interval', values = cbPalette[2:3]) +
  theme_bw() +
  theme(panel.grid.minor = element_blank(), legend.position = c(0.85, 0.85))
dplot
# export plot
jpeg('figures/DistBothIntervalGWColourNonLog.jpg', width = 5.5, height = 4,
     units = 'in', res = 400, quality = 100)
print(dplot)
invisible(dev.off())
# version for journal appendix
postscript('figures/BarnettAppendixFig2.eps', width = 5.5, height = 4,
           family = 'Times New Roman')
print(dplot)
invisible(dev.off())
```

## Narrower bin width

This plot is in bin-widths of 0.05. The x-axis range is from 0.2 to 4.

```{r plot.both.bar.nonlog.narrow.bin}
# create frequencies
bin.width <- 0.05
to.plot <- dplyr::filter(both.intervals,
                         interval > 0.2,
                         interval <= 4) %>%
  # groups are [ to ); the ratio is rounded before flooring because
  # floating-point division can land just below a whole number for a value
  # that sits exactly on a bin edge (e.g. floor(0.15/0.05) is 2, not 3), which
  # would put that value in the bin below
  mutate(groups = floor(round(interval / bin.width, 6))) %>%
  group_by(itype, source, groups) %>%
  summarise(height = n()) %>%
  mutate(x = (groups * bin.width) + (bin.width / 2)) # midpoint
# bar chart of counts per bin: interval type (rows) by source (columns)
dplot <- ggplot(data = to.plot, aes(x = x, y = height, fill = factor(itype))) +
  geom_vline(xintercept = 1, lty = 1, col = 'black') +
  geom_col() +
  facet_grid(itype ~ source, scales = 'free_y') +
  xlab('Ratio') +
  ylab('Counts') +
  scale_fill_manual('Interval', values = cbPalette[2:3]) +
  theme_bw() +
  theme(panel.grid.minor = element_blank(), legend.position = c(0.85, 0.85))
dplot
```

# Intervals over time 

We examine the cumulative plots over time to see whether things have worsened or improved.
We plot the empirical cumulative distribution function in five year bands.
We excluded the data prior to 1990 because of the relatively small numbers.

## Abstracts

```{r gw.over.time.abstracts, fig.width=8, fig.height=6}
# group into five year periods: 1 = 1990 to 1994, ..., 6 = 2015 to 2019
periods <- 1:6
ylabels <- paste(seq(1990, 2015, 5), ' to ', seq(1994, 2019, 5), sep = '')
# exclude early years with little data (this also drops rows with a missing year)
to.plot <- filter(both.intervals, interval > 0, Year >= 1990, source == 'Abstract') %>%
  mutate(five.year = floor((Year - 1985) / 5))
# numbers per 5 year period
numbers <- group_by(to.plot, five.year) %>%
  filter(itype == 'Lower') %>%  # avoid double counting
  summarise(n = n(), miny = min(Year), maxy = max(Year))
# line the counts up with the periods by value rather than by position, so
# that a period without data cannot shift counts onto the wrong label
n.period <- numbers$n[match(periods, numbers$five.year)]
n.period[is.na(n.period)] <- 0L
# add numbers to labels (format() pads with spaces, which are removed)
n.text <- str_replace_all(pattern = ' ', replacement = '',
                          string = format(n.period, big.mark = ','))
ylabels <- paste(ylabels, ' (n = ', n.text, ')', sep = '')
# add labels to data
to.plot <- mutate(to.plot, five.yearc = factor(five.year, levels = periods, labels = ylabels))
# split by CI type
lyplot <- ggplot(to.plot, aes(x = interval, col = factor(itype))) +
  geom_vline(xintercept = 1, lty = 1, col = 'black') +
  stat_ecdf(geom = "step", size = 1.1) +
  scale_x_log10(breaks = c(0.1, 1, 10), labels = c(0.1, 1, 10)) +
  scale_color_manual('Interval', values = cbPalette[2:3]) +
  xlab('Ratio') +
  ylab('Cumulative distribution') +
  coord_cartesian(xlim = c(0.1, 10)) + # limit x-axis to focus on key change
  theme_bw() +
  theme(panel.grid.minor = element_blank()) +
  facet_wrap(~five.yearc)
lyplot
# export plot
jpeg('figures/IntervalsTimeGWAbstract.jpg', width = 6.5, height = 5,
     units = 'in', res = 400, quality = 100)
print(lyplot)
invisible(dev.off())
```

The plot shows near-identical cumulative distribution functions for each five-year period.

This plot excludes `r n.no.year$count.na[n.no.year$source=='Abstract']` intervals that were missing the year.

### Abstract intervals over time on non-log scale

Here we show the same plot but on a non-log scale restricted to the ratios of 0.25 to 4.

```{r gw.over.time.abstracts.nonlog, fig.width=8, fig.height=6}
# same data as the previous chunk (`to.plot`, one panel per five-year period)
# but on the untransformed ratio scale
lyplot.nonlog <- ggplot(to.plot, aes(x = interval, colour = factor(itype))) +
  geom_vline(xintercept = 1, linetype = 1, colour = 'black') +
  stat_ecdf(geom = "step", size = 1.1) +
  coord_cartesian(xlim = c(0.25, 4)) + # limit x-axis to focus on key change
  scale_color_manual('Interval', values = cbPalette[2:3]) +
  labs(x = 'Ratio', y = 'Cumulative distribution') +
  g.theme +
  facet_wrap(~five.yearc)
lyplot.nonlog
# export plot
jpeg('figures/IntervalsTimeGWNonLogAbstract.jpg', width = 6.5, height = 5,
     units = 'in', res = 400, quality = 100)
print(lyplot.nonlog)
invisible(dev.off())
# for journal
#postscript('figures/BarnettFig3.eps', width=6.5, height=5, family='Times New Roman')
#print(lyplot.nonlog)
#invisible(dev.off())
# export plot as tiff (change font to Times)
tiff('figures/BarnettFig3.tif', width = 6.5, height = 5, units = 'in',
     res = 600, compression = 'lzw', family = 'Times New Roman')
print(lyplot.nonlog)
invisible(dev.off())
```

## Full-text

For the full-text data we start from the year 2000 because there are few papers available before then.

```{r gw.over.time.fulltext, fig.width=8, fig.height=6}
# group into five year periods: 1 = 2000 to 2004, ..., 4 = 2015 to 2019
periods <- 1:4
ylabels <- paste(seq(2000, 2015, 5), ' to ', seq(2004, 2019, 5), sep = '')
# exclude early years (before 2000) with little data (this also drops rows
# with a missing year)
to.plot <- filter(both.intervals, interval > 0, Year >= 2000, source == 'Full-text') %>%
  mutate(five.year = floor((Year - 1995) / 5))
# numbers per 5 year period
numbers <- group_by(to.plot, five.year) %>%
  filter(itype == 'Lower') %>%  # avoid double counting
  summarise(n = n(), miny = min(Year), maxy = max(Year))
# line the counts up with the periods by value rather than by position, so
# that a period without data cannot shift counts onto the wrong label
n.period <- numbers$n[match(periods, numbers$five.year)]
n.period[is.na(n.period)] <- 0L
# add numbers to labels (format() pads with spaces, which are removed)
n.text <- str_replace_all(pattern = ' ', replacement = '',
                          string = format(n.period, big.mark = ','))
ylabels <- paste(ylabels, ' (n = ', n.text, ')', sep = '')
# add labels to data
to.plot <- mutate(to.plot, five.yearc = factor(five.year, levels = periods, labels = ylabels))
# split by CI type
lyplot <- ggplot(to.plot, aes(x = interval, col = factor(itype))) +
  geom_vline(xintercept = 1, lty = 1, col = 'black') +
  stat_ecdf(geom = "step", size = 1.1) +
  scale_x_log10(breaks = c(0.1, 1, 10), labels = c(0.1, 1, 10)) +
  scale_color_manual('Interval', values = cbPalette[2:3]) +
  xlab('Ratio') +
  ylab('Cumulative distribution') +
  coord_cartesian(xlim = c(0.1, 10)) + # limit x-axis to focus on key change
  theme_bw() +
  theme(panel.grid.minor = element_blank()) +
  facet_wrap(~five.yearc)
lyplot
# export plot
jpeg('figures/IntervalsTimeGWFullText.jpg', width = 6.5, height = 5,
     units = 'in', res = 400, quality = 100)
print(lyplot)
invisible(dev.off())
```

The plot shows near-identical cumulative distribution functions for each five-year period.

This plot excludes `r n.no.year$count.na[n.no.year$source=='Full-text']` intervals that were missing the year.

### Full-text intervals over time on non-log scale

Here we show the same plot but on a non-log scale restricted to the ratios of 0.25 to 4.

```{r gw.over.time, fig.width=8, fig.height=6}
# same data as the previous chunk (`to.plot`, one panel per five-year period)
# but on the untransformed ratio scale
lyplot.nonlog <- ggplot(to.plot, aes(x = interval, colour = factor(itype))) +
  geom_vline(xintercept = 1, linetype = 1, colour = 'black') +
  stat_ecdf(geom = "step", size = 1.1) +
  coord_cartesian(xlim = c(0.25, 4)) + # limit x-axis to focus on key change
  scale_color_manual('Interval', values = cbPalette[2:3]) +
  labs(x = 'Ratio', y = 'Cumulative distribution') +
  g.theme +
  facet_wrap(~five.yearc)
lyplot.nonlog
# export plot
jpeg('figures/IntervalsTimeGWNonLogFulltext.jpg', width = 6.5, height = 5,
     units = 'in', res = 400, quality = 100)
print(lyplot.nonlog)
invisible(dev.off())
# for journal
postscript('figures/BarnettAppendixFig4.eps', width = 6.5, height = 5,
           family = 'Times New Roman')
print(lyplot.nonlog)
invisible(dev.off())
```

The plot shows a near identical cumulative distribution function for each five year period.
The plot also shows more rounding in earlier abstracts as the more recent cumulative plots are smoother.

# Sensitivity analysis using one result per abstract

The above results make no adjustment for using repeated data from the same abstract. Here we use a sensitivity analysis to check whether this dependence is influencing the results. We do this by randomly selecting one interval per abstract and re-creating the plot.

```{r sensitivity}
# this takes a while
# fix the seed so that the random selections, and therefore the plot, are the
# same every time the report is knitted
set.seed(20200101)
n.repeats <- 10 # repeat random selection 10 times
# the eligible intervals do not change between repeats, so filter them once
eligible <- filter(complete, mistake == FALSE, lower > 0) %>% # exclude mistakes
  select(-journal, -Year, -mistake) # tidy-up a bit
# one list element per repeat, combined after the loop
draws <- vector('list', n.repeats)
for (k in seq_len(n.repeats)) {
  sens <- group_by(eligible, pubmed) %>%
    mutate(rand = runif(n())) %>% # random number ...
    arrange(pubmed, rand) %>% # ... now sort by the random number and select the first result
    mutate(order = 1:n()) %>%
    ungroup() %>%
    filter(order == 1) %>% # just one result per abstract
    select(-order) %>%
    mutate(k = k)
  # now split into the lower and upper interval
  lower <- mutate(sens, itype = 'Lower') %>%
    select(k, itype, lower) %>%
    rename('interval' = 'lower')
  upper <- mutate(sens, itype = 'Upper') %>%
    select(k, itype, upper) %>%
    rename('interval' = 'upper')
  draws[[k]] <- bind_rows(lower, upper)
}
for.plot <- bind_rows(draws)
# plot the results: one line per repeat, one panel per interval type
splot <- ggplot(for.plot, aes(x = interval, lty = factor(k), col = factor(k))) +
  geom_vline(xintercept = 1, lty = 1, col = 'black') +
  stat_ecdf(geom = "step", size = 1.1) +
  coord_cartesian(xlim = c(0.1, 10)) + # limit x-axis to focus on key change
  scale_x_log10(breaks = c(0.1, 0.5, 1, 2, 10), labels = c(0.1, 0.5, 1, 2, 10)) +
  xlab('Ratio') +
  ylab('Cumulative distribution') +
  theme_bw() +
  theme(panel.grid.minor = element_blank(), legend.position = 'none') +
  facet_wrap(~ itype)
splot
```

The lines were completely overlapping, showing no impact of the repeated results.

# Unbiased data from Schuemie et al

```{r load.unbiased, include=FALSE}
# data from Figshare https://figshare.com/articles/Data_file_S1_from_Improving_reproducibility_by_using_high-throughput_observational_studies_with_empirical_calibration/6684080
# keep the true and estimated hazard ratios with their 95% limits, restrict
# to true hazard ratios of at most 1.5 and label each row by group
unbiased <- read.csv('data/rsta20170356_si_002.csv', stringsAsFactors = FALSE) %>%
  dplyr::select(True.hazard.ratio, Hazard.ratio,
                Lower.bound.95..CI, Upper.bound.95..CI) %>%
  # just the negative and one example of a positive association
  filter(True.hazard.ratio <= 1.5) %>%
  mutate(group = ifelse(True.hazard.ratio == 1, 'Negative', 'Positive'))
```

Here we examine the data from Schuemie et al which is a large unbiased sample of hazard ratios from an observational analysis of insurance data.
We use `r format(nrow(unbiased),big.mark=',')` hazard ratios.

```{r unbiased.plot}
# add sample size to the group labels used for the panels
group.counts <- table(unbiased$group)
group.labels <- paste(names(group.counts), ' (n = ',
                      format(as.numeric(group.counts), big.mark = ','),
                      ')', sep = '')
unbiased <- unbiased %>%
  mutate(group.nice = factor(group, levels = c('Negative', 'Positive'),
                             labels = group.labels))
# long format: non-missing lower limits ...
lower.limits <- unbiased %>%
  filter(!is.na(Lower.bound.95..CI)) %>%
  mutate(itype = 'Lower') %>%
  select(itype, True.hazard.ratio, interval = Lower.bound.95..CI, group.nice)
# ... stacked on non-missing upper limits
upper.limits <- unbiased %>%
  filter(!is.na(Upper.bound.95..CI)) %>%
  mutate(itype = 'Upper') %>%
  select(itype, True.hazard.ratio, interval = Upper.bound.95..CI, group.nice)
for.plot <- bind_rows(lower.limits, upper.limits)
# cumulative distributions of both limits, one panel per group
uplot <- ggplot(for.plot, aes(x = interval, colour = itype)) +
  geom_vline(xintercept = 1, linetype = 1, colour = 'black') +
  stat_ecdf(geom = "step", size = 1.1) +
  # non-log version
  coord_cartesian(xlim = c(0.25, 4)) + # limit x-axis to focus on key change
  # log version
#  coord_cartesian(xlim=c(0.1, 10))+ # limit x-axis to focus on key change
#  scale_x_log10(breaks=c(0.1,0.5,1,2,10), labels=c(0.1,0.5,1,2,10))+
  scale_color_manual('Interval', values = cbPalette[2:3]) +
  labs(x = 'Ratio', y = 'Cumulative distribution') +
  g.theme +
  theme(legend.position = c(0.85, 0.2)) +
  facet_wrap(~ group.nice)
uplot
# export plot
jpeg('figures/IntervalsSchuemie.jpg', width = 5, height = 3.5, units = 'in',
     res = 400, quality = 100)
print(uplot)
invisible(dev.off())
# export for journal (change name)
#postscript('figures/BarnettFig2.eps', width=5, height=3.5, family='Times New Roman')
#print(uplot)
#invisible(dev.off())
tiff('figures/BarnettFig2.tif', width = 5, height = 3.5, units = 'in',
     res = 600, compression = 'lzw', family = 'Times New Roman')
print(uplot)
invisible(dev.off())
```