Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

671 post imputation function #5

Draft
wants to merge 13 commits into
base: main
Choose a base branch
from
4 changes: 2 additions & 2 deletions .github/workflows/main.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -41,8 +41,8 @@ jobs:
- name: create container
run: docker run -id --name container_${{matrix.cml_version}} -v"$(pwd)"://home/cdsw cml:${{matrix.cml_version}}
- name: build in dev mode
run: docker exec container_${{matrix.cml_version}} pip install ."[dev]"
run: docker exec container_${{matrix.cml_version}} pip install -e .[dev]
- name: check env
run: docker exec container_${{matrix.cml_version}} pip list
- name: test
run: docker exec container_${{matrix.cml_version}} pytest
run: docker exec container_${{matrix.cml_version}} pytest -v
5 changes: 5 additions & 0 deletions cons_results/config.json
Original file line number Diff line number Diff line change
Expand Up @@ -4,5 +4,10 @@
},
"logging": {
"file": "/var/log/construction_survey.log"
},
"question_number_mapping": {
"mapping_1": {"derive": 5,"from": [1, 2, 3, 4]},
"mapping_2":{"derive": 6,"from": [1, 2]},
"mapping_3":{"derive": 7,"from": [3, 4]}
}
}
43 changes: 43 additions & 0 deletions cons_results/imputation/impute.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
from warnings import warn

import pandas as pd
from mbs_results.imputation.ratio_of_means import ratio_of_means

from cons_results.imputation.post_imputation import post_imputation_processing


def impute(df: pd.DataFrame, config: dict) -> pd.DataFrame:
    """
    Main wrapper for the imputation method on the construction survey.

    Runs ``ratio_of_means`` separately for every question number and then
    hands the imputed data to ``post_imputation_processing``.

    Parameters
    ----------
    df : pd.DataFrame
        staged dataframe
    config : dict
        config dictionary. The keys ``question_no``, ``reference``,
        ``target``, ``period`` and ``auxiliary`` are read here; the whole
        dictionary is also expanded as keyword arguments into
        ``post_imputation_processing``.

    Returns
    -------
    pd.DataFrame
        post imputation dataframe
    """
    # `warn` is already the function (``from warnings import warn``), so it
    # is called directly; ``warn.warn`` raised AttributeError.
    warn("This method is not fully developed and acting as a placeholder")

    pre_impute_dataframe = df.copy()
    manual_constructions = None

    post_impute = pre_impute_dataframe.groupby(config["question_no"]).apply(
        lambda group_df: ratio_of_means(
            df=group_df,
            manual_constructions=manual_constructions,
            reference=config["reference"],
            target=config["target"],
            period=config["period"],
            strata="imputation_class",
            auxiliary=config["auxiliary"],
        )
    )
    # groupby.apply can add the group key as an index level; drop the index
    # so the key columns are unambiguous for the merge done downstream.
    # NOTE(review): assumes ratio_of_means keeps the question number column
    # in its output - confirm against mbs_results.
    post_impute = post_impute.reset_index(drop=True)

    # Post-process the imputed data. Previously the un-imputed input `df`
    # was passed, which silently discarded the ratio-of-means result.
    return post_imputation_processing(post_impute, **config)
260 changes: 260 additions & 0 deletions cons_results/imputation/post_imputation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,260 @@
import warnings
from typing import List

import numpy as np
import pandas as pd

# Case 1 - all responses - calculate total (same whether or not a total is given)
# Case 2 - some non-responses - no total - impute non-responses and calculate total
# Case 3 - some non-responses - given total - impute non-responses and re-weight imputed
# Case 4 - all non-responses - no total - impute non-responses and calculate total

# Can the total be imputed? My guess would be no.


def calculate_totals(df: pd.DataFrame, derive_from: List[int]) -> pd.DataFrame:
    """
    Returns the sums of a dataframe in which the first level index is in
    derive_from. Missing values are treated as 0 before summing, and the
    per-question frames are added together aligned on their remaining index
    levels. Columns must contain float or int.

    Parameters
    ----------
    df : pd.DataFrame
        Dataframe to sum, first level index must contain values from derive_from

    derive_from : List[int]
        Values to take a subset of df. Values not present in the first index
        level are skipped.

    Returns
    -------
    sums : pd.DataFrame
        A dataframe with the sums (a column named ``target`` is renamed to
        ``derived_target``), a ``constrain_marker`` column, and the remaining
        index levels as columns. When none of the values in derive_from is
        present in df, an empty dataframe with those same columns is returned.
    """

    df_temp = df.fillna(0)

    parts = [
        df_temp.loc[question_no]
        for question_no in derive_from
        if question_no in df_temp.index
    ]

    if parts:
        sums = sum(parts)
    else:
        # sum([]) is the int 0, which has no .rename(); build an empty frame
        # with the same shape a real result would have instead.
        sums = df_temp.iloc[0:0]
        if sums.index.nlevels > 1:
            sums = sums.droplevel(0)

    sums = sums.rename(columns={"target": "derived_target"})
    return sums.assign(constrain_marker=f"sum{derive_from}").reset_index()


def create_derive_map():
    """
    Build the derived-question mapping used by post imputation processing.

    The mapping is currently hard-coded: question 5 is derived from
    questions 1 to 4. A warning is emitted on every call as a reminder that
    the mapping is a placeholder.

    Returns
    -------
    dict
        Dictionary with key ``"derive"`` (the derived question number) and
        key ``"from"`` (the list of question numbers that are summed).
    """
    placeholder_notice = (
        "This might need to be expanded to allow for multiple derived questions "
        "and updated to correct question numbers"
    )
    warnings.warn(placeholder_notice)

    return {"derive": 5, "from": [1, 2, 3, 4]}


def post_imputation_processing(
    df: pd.DataFrame,
    period,
    reference,
    question_no,
    target,
    imputation_marker,
    **config,
) -> pd.DataFrame:
    """
    First outline of post imputation processing for the construction survey.

    Derives the total question from its component questions, merges the
    derived totals onto the data, and rescales imputed values per
    period / reference group.

    Parameters
    ----------
    df : pd.DataFrame
        post imputation dataframe
    period : str
        period column name
    reference : str
        reference column name
    question_no : str
        question number column name
    target : str
        target column name
    imputation_marker : str
        imputation marker column name
    **config: Dict
        main pipeline configuration. Can be used to input the entire config
        dictionary; any extra keys are accepted and not used here.

    Returns
    -------
    pd.DataFrame
        post imputation dataframe with derived questions and rescaled values where
        needed
    """
    question_no_mapping = create_derive_map()
    key_columns = [question_no, period, reference]

    df_subset = df.set_index(key_columns, verify_integrity=False, drop=True)
    # calculate_totals renames a column literally called "target" to
    # "derived_target". Normalise the configured column name first, so the
    # derived column is produced for any target name and does not collide
    # with the original target column in the merge below.
    df_subset = df_subset[[target]].rename(columns={target: "target"})

    derived_values = calculate_totals(df_subset, question_no_mapping["from"]).assign(
        **{question_no: question_no_mapping["derive"]}
    )

    final_constrained = pd.merge(df, derived_values, on=key_columns, how="outer")

    # rescale_imputed_values takes a mapping of named mappings, so the single
    # flat mapping is wrapped before being passed on.
    nested_mapping = {"mapping_1": question_no_mapping}

    final_constrained = (
        final_constrained.groupby([period, reference])
        .apply(
            lambda group_df: rescale_imputed_values(
                group_df, question_no, target, imputation_marker, nested_mapping
            )
        )
        .reset_index(drop=True)
    )

    return final_constrained


def _derived_question_numbers(question_no_mapping: dict) -> list:
    """Return the derived question numbers from a flat or nested mapping."""
    flat_derive = question_no_mapping.get("derive")
    if "derive" in question_no_mapping and not isinstance(flat_derive, dict):
        # Flat form: {"derive": 5, "from": [...]}
        return [flat_derive]
    # Nested form: {"mapping_1": {"derive": 5, "from": [...]}, ...}
    return [mapping["derive"] for mapping in question_no_mapping.values()]


def rescale_imputed_values(
    df: pd.DataFrame,
    question_no: str,
    target: str,
    imputation_marker: str,
    question_no_mapping: dict,
    drop_intermediate_cols: bool = False,
) -> pd.DataFrame:
    """
    Rescales imputed / constructed values if the total is a return.

    The dataframe is modified in place and returned. It must contain a
    ``derived_target`` column holding the derived total on the derived
    question rows.

    Parameters
    ----------
    df : pd.DataFrame
        original dataframe, grouped by period and reference
    question_no : str
        question number column name
    target : str
        target column name
    imputation_marker : str
        imputation marker column name
    question_no_mapping : dict
        dictionary containing question number derived and question numbers
        summed. Either a single mapping (``{"derive": 5, "from": [...]}``) or
        a dictionary of such mappings keyed by name.
    drop_intermediate_cols : bool
        when True, the ``adjusted_value`` column is dropped before returning.
        This only applies when the rescaling step is reached, not on the
        early returns.
        NOTE(review): dropping the adjusted values looks unintended - confirm
        which columns are meant to be intermediate.

    Returns
    -------
    pd.DataFrame
        original dataframe with adjusted values and rescale factors
    """

    derived_question_no_list = _derived_question_numbers(question_no_mapping)
    derived_question_mask = df[question_no].isin(derived_question_no_list)

    returned_total = df.loc[derived_question_mask, target]
    derived_total = df.loc[derived_question_mask, "derived_target"]

    # Checking if target and derived target are equal. Series.equals also
    # compares dtypes, so both sides are cast to float first; otherwise an
    # integer target never equals a float derived total.
    if returned_total.astype("float64").equals(derived_total.astype("float64")):
        df["adjusted_value"] = df[target]
        df["rescale_factor"] = np.nan
        return df

    # Check if all markers are 'r'
    if (df[imputation_marker].nunique() == 1) and (
        "r" in df[imputation_marker].unique()
    ):
        # The reference is only needed for this message, so it is looked up
        # here and tolerates a missing "reference" column.
        if "reference" in df.columns and len(df) > 0:
            reference_value = df["reference"].iloc[0]
        else:
            reference_value = "unknown"
        warnings.warn(
            "Derived and returned value are not equal."
            + f"All other values are returns. reference: {reference_value} \n"
        )
        df["adjusted_value"] = df[target]
        df["rescale_factor"] = np.nan
        return df

    # Handles if target is NaN i.e. not returned total
    if returned_total.isna().any():
        df["adjusted_value"] = df[target]
        df.loc[derived_question_mask, "adjusted_value"] = derived_total
        df["rescale_factor"] = np.nan
        return df

    # If target and derived_target values are not equal for derived question, rescale
    is_component = ~derived_question_mask
    sum_returned_exclude_total = df.loc[
        is_component & (df[imputation_marker] == "r"),
        target,
    ].sum()
    sum_imputed = df.loc[
        is_component & (df[imputation_marker] != "r"),
        target,
    ].sum()

    if sum_imputed == 0:
        # Nothing can be rescaled; dividing would give inf / NaN factors.
        warnings.warn(
            "Sum of imputed values is zero, values cannot be rescaled to the "
            "returned total."
        )
        df["rescale_factor"] = np.nan
        df["adjusted_value"] = df[target].astype("float64")
    else:
        # Calculate the rescale factor from the first derived question row
        rescale_factor = (
            returned_total.values[0] - sum_returned_exclude_total
        ) / sum_imputed
        df["rescale_factor"] = np.where(
            df[imputation_marker] != "r", rescale_factor, 1
        )
        df.loc[derived_question_mask, "rescale_factor"] = np.nan

        # Apply the rescale factor to the target values
        df["adjusted_value"] = df[target] * df["rescale_factor"]

    # Set derived question value to target if a return, derived otherwise
    df.loc[derived_question_mask, "adjusted_value"] = np.where(
        df.loc[derived_question_mask, imputation_marker] == "r",
        df.loc[derived_question_mask, target],
        df.loc[derived_question_mask, "derived_target"],
    )

    if drop_intermediate_cols:
        df.drop(columns=["adjusted_value"], inplace=True)

    return df  # Return the modified DataFrame


if __name__ == "__main__":
    # Scratch demonstration: collect the derived question number(s) from a
    # flat derive mapping and print them. The nested mapping is defined for
    # reference only and is not read below.
    derive_map_nested = {
        "map_1": {"derive": 5, "from": [1, 2, 3, 4]},
        "map_2": {"derive": 6, "from": [1, 2]},
    }

    derive_map = {"derive": 5, "from": [1, 2, 3, 4]}
    q_list = [value for key, value in derive_map.items() if key == "derive"]
    print(q_list)
42 changes: 42 additions & 0 deletions tests/data/imputation/test_data_rescale_imputed_double.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
period,reference,question_no,target,marker,derived_target,constrain_marker,adjusted_value,rescale_factor
2020,100,1,10,r,,,10,
2020,100,2,20,r,,,20,
2020,100,3,30,r,,,30,
2020,100,4,40,r,,,40,
2020,100,5,100,r,100,"sum[1, 2, 3, 4]",100,
2020,100,6,30,r,30,"sum[1, 2]",30,

2020,101,1,10,r,,,10,
2020,101,2,11,r,,,11,
2020,101,3,12,r,,,12,
2020,101,4,13,r,,,13,
2020,101,5,50,r,46,"sum[1, 2, 3, 4]",50,
2020,101,6,25,r,21,"sum[1, 2]",25,

2020,102,1,10,r,,,10,1
2020,102,2,20,fir,,,16,0.8
2020,102,3,30,fir,,,24,0.8
2020,102,4,40,r,,,40,1
2020,102,5,90,r,100,"sum[1, 2, 3, 4]",90,
2020,102,6,26,r,30,"sum[1, 2]",26,

2020,103,1,14,bir,,,10.9375,0.78125
2020,103,2,20,fir,,,15.625,0.78125
2020,103,3,30,fir,,,23.4375,0.78125
2020,103,4,40,r,,,40,1
2020,103,5,90,r,104,"sum[1, 2, 3, 4]",90,
2020,103,6,90,r,104,"sum[1, 2]",90,

2020,104,1,10,fir,,,10,
2020,104,2,10,fic,,,10,
2020,104,3,10,bir,,,10,
2020,104,4,25,c,,,25,
2020,104,5,,,55,"sum[1, 2, 3, 4]",55,
2020,104,6,,,20,"sum[1, 2]",20,

2020,105,1,10,fir,,,9,0.9
2020,105,2,20,fic,,,18,0.9
2020,105,3,30,bir,,,27,0.9
2020,105,4,40,c,,,36,0.9
2020,105,5,90,r,100,"sum[1, 2, 3, 4]",90,
2020,105,6,27,r,30,"sum[1, 2]",27,
Loading
Loading