# process_data.py

# import libraries
import sys

import pandas as pd
from sqlalchemy import create_engine


def load_data(messages_filepath, categories_filepath):
    """
    Read the messages and categories csv files and join them on `id`.

    INPUT:
        messages_filepath - path to the messages csv file
        categories_filepath - path to the categories csv file
    OUTPUT:
        df - pandas DF holding the rows of both files whose `id` matches
    """
    # read both csv files in the order (messages, categories)
    messages_df, categories_df = [
        pd.read_csv(csv_path)
        for csv_path in (messages_filepath, categories_filepath)
    ]

    # default (inner) join on the shared `id` column
    return messages_df.merge(categories_df, on='id')


def clean_data(df):
    """
    Clean the merged messages/categories dataframe.

    Split the single `categories` column into one column per category,
    name each column after its category, convert the values to binary
    0/1 flags and drop duplicate rows.

    INPUT:
        df - pandas dataframe with messages and categories in source
             format; each `categories` cell is expected to be a
             ';'-separated list of '<name>-<value>' items

    OUTPUT:
        df - cleaned pandas dataframe with one 0/1 column per category
    """
    # one column per category; cells still look like '<name>-<value>'
    categories = df['categories'].str.split(';', expand=True)

    # the first row supplies the column names: everything before the
    # last '-' of each cell (robust to values longer than one digit)
    first_row = categories.iloc[0]
    categories.columns = [cell.rsplit('-', 1)[0] for cell in first_row]

    for column in categories:
        # keep only the numeric part after the last '-'
        values = categories[column].str.rsplit('-', n=1).str[-1].astype(int)

        # any non-zero value means "category present", so the result is
        # strictly 0 or 1 even when the source contains e.g. 'related-2'
        categories[column] = (values > 0).astype(int)

    # replace the raw `categories` column with the expanded flag columns
    df = df.drop(['categories'], axis=1)
    df = pd.concat([df, categories], axis=1, sort=False)

    # drop fully identical rows
    return df.drop_duplicates()


def save_data(df, database_filename):
    """
    Save data from a pandas df into SQLite database.

    The data is written to the table `DisasterResponse`. An existing
    table of that name is replaced, so the pipeline can be re-run
    against the same database file.

    INPUT:
      df -- pandas DF with messages and categories
      database_filename -- path to location of database file

    OUTPUT:
      None
    """
    engine = create_engine('sqlite:///{}'.format(database_filename))
    try:
        # the default if_exists='fail' raises ValueError on a second run
        df.to_sql('DisasterResponse', engine, index=False,
                  if_exists='replace')
    finally:
        # release the pooled connection to the database file
        engine.dispose()


def main():
    """
    Run the load / clean / save pipeline from command line arguments.

    Expects exactly three arguments after the script name: the messages
    csv path, the categories csv path and the database path. With any
    other number of arguments a usage message is printed instead.
    """
    cli_args = sys.argv[1:]

    # wrong number of arguments: show usage and stop
    if len(cli_args) != 3:
        print('Please provide the filepaths of the messages and categories '
              'datasets as the first and second argument respectively, as '
              'well as the filepath of the database to save the cleaned data '
              'to as the third argument. \n\nExample: python process_data.py '
              'disaster_messages.csv disaster_categories.csv '
              'DisasterResponse.db')
        return

    messages_filepath, categories_filepath, database_filepath = cli_args

    loading_msg = 'Loading data...\n    MESSAGES: {}\n    CATEGORIES: {}'
    print(loading_msg.format(messages_filepath, categories_filepath))
    merged = load_data(messages_filepath, categories_filepath)

    print('Cleaning data...')
    cleaned = clean_data(merged)

    saving_msg = 'Saving data...\n    DATABASE: {}'
    print(saving_msg.format(database_filepath))
    save_data(cleaned, database_filepath)

    print('Cleaned data saved to database!')


if __name__ == '__main__':
    # Debug fallback: substitute default paths when no argv is present.
    # NOTE(review): sys.argv normally holds at least the script name, so
    # this branch looks unreachable in a regular run -- confirm whether
    # "no extra arguments" (length 1) was the intended condition.
    if not sys.argv:
        sys.argv = [
            '.',
            './data/disaster_messages.csv',
            './data/disaster_categories.csv',
            './data/DisasterResponse.db',
        ]

    # run main thread
    main()