Skip to content

Commit

Permalink
#17 dedupe cfpb db
Browse files Browse the repository at this point in the history
  • Loading branch information
judgelord committed Mar 24, 2021
1 parent b25ddf7 commit 1421392
Showing 1 changed file with 27 additions and 0 deletions.
27 changes: 27 additions & 0 deletions code/sql_cfpb_df.R
Original file line number Diff line number Diff line change
Expand Up @@ -162,7 +162,33 @@ unmatched %>%
# 9 NA CFPB-2013-0035
# 10 NA CFPB-2017-0027

############################

# FIXME: this will not be necessary with the next version of devin's db
# Rewrite the literal substring "document?D=" in comment_url to "document/".
# The pattern must be wrapped in fixed(): stringr interprets a bare string as
# a regular expression, where "?" is a quantifier making the preceding "t"
# optional. As a regex, "document?D=" matches "documenD=" or "documentD=" but
# never the literal "document?D=", so the replacement silently did nothing.
comments_cfpb_df %<>%
  mutate(
    comment_url = comment_url %>%
      str_replace(fixed("document?D="), "document/")
  )

# Refining for uniqueness: work on a copy (d) and write it back at the end
d <- comments_cfpb_df

# interactive check of the available columns
names(d)

d %<>% select(-late_comment)

# drop exact duplicate rows, then order by url
d %<>% distinct() %>% arrange(comment_url)

# oddly, some comments have more than one posted date; maybe they were updated between my scrapes?
# Keep only the row(s) with the latest posted_date for each comment_url, then
# ungroup() so the grouping does not leak into comments_cfpb_df and silently
# change how later verbs (mutate, slice, count, ...) behave on it.
d %<>%
  group_by(comment_url) %>%
  slice_max(posted_date) %>%
  ungroup()

# check for duplicate urls (primary key to attachments table)
# slice_max() keeps ties by default, so a url whose remaining rows share the
# same latest posted_date but differ in another column will still appear here.
d %>%
  add_count(comment_url, sort = TRUE) %>%
  filter(n > 1) %>%
  arrange(comment_url)


comments_cfpb_df <- d


##########################
Expand Down Expand Up @@ -287,6 +313,7 @@ actions_cfpb %>%




names(comments_cfpb_df)
# N
comments_cfpb_df %>%
Expand Down

0 comments on commit 1421392

Please sign in to comment.