diff --git a/.github/has-functional-changes.sh b/.github/has-functional-changes.sh index 48f97805..b010f0d1 100755 --- a/.github/has-functional-changes.sh +++ b/.github/has-functional-changes.sh @@ -3,7 +3,7 @@ IGNORE_DIFF_ON="README.md CONTRIBUTING.md Makefile .gitignore .github/*" last_tagged_commit=`git describe --tags --abbrev=0 --first-parent` # --first-parent ensures we don't follow tags not published in master through an unlikely intermediary merge commit - +echo ".github/has-functional-changes.sh : last_tagged_commit=$last_tagged_commit" if git diff-index --name-only --exit-code $last_tagged_commit -- . `echo " $IGNORE_DIFF_ON" | sed 's/ / :(exclude)/g'` # Check if any file that has not be listed in IGNORE_DIFF_ON has changed since the last tag was published. then echo "No functional changes detected." diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index e41981c3..ce4dc8ae 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -1,4 +1,4 @@ -include: 'gitlab_ci/all_years_build_and_aggregates.yml' +include: '.gitlab-ci/all_years_build_and_aggregates.yml' variables: PIP_CACHE_DIR: "$CI_PROJECT_DIR/.cache/pip" @@ -7,6 +7,7 @@ variables: # OUT_FOLDER: "$CI_COMMIT_REF_NAME-$CI_COMMIT_SHORT_SHA" # For branch-commit_id OUT_FOLDER: "$CI_COMMIT_REF_NAME" # For just branch ROOT_FOLDER: "/mnt/data-out/openfisca-france-data" +# GIT_DEPTH: 1000 # To be able to get last tag (default 50) cache: paths: @@ -25,14 +26,12 @@ stages: - aggregates_all - anaconda - before_script: # To be sure we are up to date even if we do not rebuild docker image - make install - - cp ./ci-runner/openfisca_survey_manager_raw_data.ini ~/.config/openfisca-survey-manager/raw_data.ini + - cp ./.gitlab-ci/openfisca_survey_manager_raw_data.ini ~/.config/openfisca-survey-manager/raw_data.ini - echo "End of before_script" - build docker image: stage: docker tags: @@ -51,7 +50,6 @@ build docker image: # Build Docker is needed only if code as changed. 
when: manual - test: image: $CI_REGISTRY_IMAGE:latest script: @@ -60,7 +58,6 @@ test: tags: - openfisca - clean_folder: before_script: - '' @@ -70,7 +67,6 @@ clean_folder: - openfisca when: manual - build_collection: image: $CI_REGISTRY_IMAGE:latest script: @@ -78,7 +74,7 @@ build_collection: - rm -rf $ROOT_FOLDER/$OUT_FOLDER || true - mkdir -p $ROOT_FOLDER/$OUT_FOLDER/data_collections/ - mkdir -p $ROOT_FOLDER/$OUT_FOLDER/data_output/ - - cp ./ci-runner/openfisca_survey_manager_config.ini ~/.config/openfisca-survey-manager/config.ini + - cp ./.gitlab-ci/openfisca_survey_manager_config.ini ~/.config/openfisca-survey-manager/config.ini - echo "Custom output folder" - sed -i "s/BRANCH_NAME/$OUT_FOLDER/" ~/.config/openfisca-survey-manager/config.ini - 'echo "{\"name\": \"erfs_fpr\", \"surveys\": {}}" > $ROOT_FOLDER/$OUT_FOLDER/data_collections/erfs_fpr.json' @@ -94,7 +90,6 @@ build_collection: - openfisca when: manual - copy_previous_build_collections: before_script: - '' @@ -108,7 +103,7 @@ copy_previous_build_collections: cp $ROOT_FOLDER/master/openfisca_survey_manager_config-after-build-collection.ini $ROOT_FOLDER/$OUT_FOLDER/openfisca_survey_manager_config-after-build-collection.ini sed -i "s/master/$OUT_FOLDER/" $ROOT_FOLDER/$OUT_FOLDER/openfisca_survey_manager_config-after-build-collection.ini cp $ROOT_FOLDER/master/data_collections/erfs_fpr.json $ROOT_FOLDER/$OUT_FOLDER/data_collections/erfs_fpr.json - cp ./ci-runner/empty_openfisca_erfs_fpr.json $ROOT_FOLDER/$OUT_FOLDER/data_collections/openfisca_erfs_fpr.json + cp ./.gitlab-ci/empty_openfisca_erfs_fpr.json $ROOT_FOLDER/$OUT_FOLDER/data_collections/openfisca_erfs_fpr.json fi stage: build_collection tags: @@ -128,19 +123,25 @@ diagnostics: - cp $ROOT_FOLDER/$OUT_FOLDER/openfisca_survey_manager_config_input_data-after-build-erfs-fprs-2019.ini ~/.config/openfisca-survey-manager/config.ini - mkdir -p ~/.config/openfisca-france-data - - cp ./ci-runner/openfisca_france_data_config.ini + - cp 
./.gitlab-ci/openfisca_france_data_config.ini ~/.config/openfisca-france-data/config.ini - sed -i "s/BRANCH_NAME/$OUT_FOLDER/" ~/.config/openfisca-france-data/config.ini - cat ~/.config/openfisca-france-data/config.ini - compare-erfs-fpr-input -u -s -v - cp -r /mnt/data-out/openfisca-france-data/$OUT_FOLDER/figures_directory . - - ls -alrth - - ls -alrth figures_directory - cp -r ./figures_directory $ROOT_FOLDER/$OUT_FOLDER/data_output stage: diagnostics tags: - openfisca +check-version-and-changelog: + stage: diagnostics + before_script: + - '' + needs: + - input_data-2019 + script: + - .gitlab-ci/is-version-number-acceptable.sh run_on_all_years: stage: run_on_all_years @@ -151,15 +152,26 @@ run_on_all_years: - echo "On ne fait rien" when: manual +check-for-functional-changes: + stage: anaconda + needs: + - check-version-and-changelog + before_script: + - '' + script: + - if `.github/has-functional-changes.sh` ; then echo "OK to build package" ; fi + only: + - master build_conda_package: + stage: anaconda + needs: + - check-for-functional-changes before_script: - '' - except: + only: - master image: continuumio/miniconda3 script: - conda install -y conda-build anaconda-client - - conda build -c conda-forge -c openfisca --token $ANACONDA_TOKEN --user OpenFisca - .conda - stage: anaconda + - conda build -c conda-forge -c openfisca --token $ANACONDA_TOKEN --user OpenFisca .conda diff --git a/ci-runner/README.md b/.gitlab-ci/README.md similarity index 93% rename from ci-runner/README.md rename to .gitlab-ci/README.md index e11492ca..094679db 100644 --- a/ci-runner/README.md +++ b/.gitlab-ci/README.md @@ -7,7 +7,7 @@ This folder contains files needed for the CI. To separate the different years and survey files we have a script that build the CI script. ``` -python ci-runner/build_ci.py +python .gitlab-ci/build_ci.py ``` It will create the file `.gitlab-ci.yml` that is read by Gitlab Runner to execute the CI. 
@@ -31,8 +31,8 @@ All following steps is run with this docker image. It is a manual step because it does not to be build each time and took a very long time : between 2 and 4 hours. It use the `build-collection` command from [OpenFisca-Survey-Manager](https://github.com/openfisca/openfisca-survey-manager). Input : -- [../ci-runner/openfisca_survey_manager_config.ini](ci-runner/openfisca_survey_manager_config.ini) -- [../ci-runner/openfisca_survey_manager_raw_data.ini](ci-runner/openfisca_survey_manager_raw_data.ini) +- [../.gitlab-ci/openfisca_survey_manager_config.ini](.gitlab-ci/openfisca_survey_manager_config.ini) +- [../.gitlab-ci/openfisca_survey_manager_raw_data.ini](.gitlab-ci/openfisca_survey_manager_raw_data.ini) - All survey's files located in `/mnt/data-in/erfs-fpr/` folder that is accessible to the CI Runner. Output : diff --git a/gitlab_ci/all_years_build_and_aggregates.yml b/.gitlab-ci/all_years_build_and_aggregates.yml similarity index 99% rename from gitlab_ci/all_years_build_and_aggregates.yml rename to .gitlab-ci/all_years_build_and_aggregates.yml index 145e4557..36c088d0 100644 --- a/gitlab_ci/all_years_build_and_aggregates.yml +++ b/.gitlab-ci/all_years_build_and_aggregates.yml @@ -1,7 +1,7 @@ ################################################ # GENERATED FILE, DO NOT EDIT -# Please visit ci-runner/README.md +# Please visit .gitlab-ci/README.md ################################################ input_data-2019: @@ -9,9 +9,8 @@ input_data-2019: script: - echo "build_input_data-2019" - mkdir -p $ROOT_FOLDER/$OUT_FOLDER - - cp $ROOT_FOLDER/$OUT_FOLDER/openfisca_survey_manager_config-after-build-collection.ini ~/.config/openfisca-survey-manager/config.ini - - cat ~/.config/openfisca-survey-manager/config.ini - - ls $ROOT_FOLDER/$OUT_FOLDER/data_collections + - cp $ROOT_FOLDER/$OUT_FOLDER/openfisca_survey_manager_config-after-build-collection.ini + ~/.config/openfisca-survey-manager/config.ini - build-erfs-fpr -y 2019 -f 
$ROOT_FOLDER/$OUT_FOLDER/data_output/erfs_flat_2019.h5 - cp ~/.config/openfisca-survey-manager/config.ini $ROOT_FOLDER/$OUT_FOLDER/openfisca_survey_manager_config_input_data-after-build-erfs-fprs-2019.ini stage: build_input_data diff --git a/ci-runner/build_ci.py b/.gitlab-ci/build_ci.py similarity index 95% rename from ci-runner/build_ci.py rename to .gitlab-ci/build_ci.py index 014c0acc..3553e39e 100644 --- a/ci-runner/build_ci.py +++ b/.gitlab-ci/build_ci.py @@ -3,20 +3,20 @@ Run in project root folder: -python ci-runner/build_ci.py +python .gitlab-ci/build_ci.py """ import configparser import yaml # Config file use to get the available years -CONFIG = "./ci-runner/openfisca_survey_manager_raw_data.ini" +CONFIG = "./.gitlab-ci/openfisca_survey_manager_raw_data.ini" def header(): return """ ################################################ # GENERATED FILE, DO NOT EDIT -# Please visit ci-runner/README.md +# Please visit .gitlab-ci/README.md ################################################ """ @@ -111,7 +111,6 @@ def get_erfs_years(): raise KeyError - def build_gitlab_ci(erfs_years): gitlab_ci = header() # gitlab_ci += yaml.dump(make_test()) @@ -134,7 +133,7 @@ def main(): # For testing only some years # erfs_years = ["2016", "2017", "2018"] gitlab_ci = build_gitlab_ci(erfs_years) - with open(r"./gitlab_ci/all_years_build_and_aggregates.yml", mode="w") as file: + with open(r"./.gitlab-ci/all_years_build_and_aggregates.yml", mode="w") as file: file.write(gitlab_ci) print("Done with success!") diff --git a/ci-runner/empty_openfisca_erfs_fpr.json b/.gitlab-ci/empty_openfisca_erfs_fpr.json similarity index 100% rename from ci-runner/empty_openfisca_erfs_fpr.json rename to .gitlab-ci/empty_openfisca_erfs_fpr.json diff --git a/.gitlab-ci/has-functional-changes.sh b/.gitlab-ci/has-functional-changes.sh new file mode 100755 index 00000000..0784a74b --- /dev/null +++ b/.gitlab-ci/has-functional-changes.sh @@ -0,0 +1,18 @@ +#! 
/usr/bin/env bash + +IGNORE_DIFF_ON="README.md CONTRIBUTING.md Makefile .gitignore .github/*" + +# Fetch all tags +git fetch --tags + +last_tagged_commit=`git tag --list | sort -V | grep -v ipp | tail -1` +echo ".gitlab-ci/has-functional-changes.sh : avec git tag --list last_tagged_commit=$last_tagged_commit" + +# Check if any file that has not be listed in IGNORE_DIFF_ON has changed since the last tag was published. +if git diff-index --name-only --exit-code $last_tagged_commit -- . `echo " $IGNORE_DIFF_ON" | sed 's/ / :(exclude)/g'` +then + echo "No functional changes detected." + exit 1 +else + echo "The functional files above were changed." +fi diff --git a/.gitlab-ci/is-version-number-acceptable.sh b/.gitlab-ci/is-version-number-acceptable.sh new file mode 100755 index 00000000..ad7850a2 --- /dev/null +++ b/.gitlab-ci/is-version-number-acceptable.sh @@ -0,0 +1,47 @@ +#! /usr/bin/env bash + +if [[ ${CI_COMMIT_REF_NAME} == master ]] +then + echo "No need for a version check on master.👍" + exit 0 +else + echo "Not on master." +fi + +if ! $(dirname "$BASH_SOURCE")/has-functional-changes.sh +then + echo "No need for a version update.👍" + exit 0 +else + echo "Need for a version update." +fi + +current_version=`python setup.py --version` + +if git rev-parse --verify --quiet $current_version +then + echo "Version $current_version already exists in commit:" + git --no-pager log -1 $current_version + echo + echo "Update the version number in setup.py before merging this branch into master.😒" + echo "Look at the CONTRIBUTING.md file to learn how the version number should be updated." + exit 1 +else + echo "Version $current_version don't exists in commit history 👍" +fi + +if ! $(dirname "$BASH_SOURCE")/has-functional-changes.sh | grep --quiet CHANGELOG.md +then + echo "CHANGELOG.md has not been modified, while functional changes were made. 😒" + echo "Explain what you changed before merging this branch into master." 
+ echo "Look at the CONTRIBUTING.md file to learn how to write the changelog." + exit 2 +else + if ! grep --quiet $current_version CHANGELOG.md + then + echo "CHANGELOG.md has been modified. BUT $current_version don't exists in it. 😒" + exit 3 + else + echo "CHANGELOG.md has been modified and $current_version exists in it. 👍" + fi +fi diff --git a/ci-runner/openfisca_france_data_config.ini b/.gitlab-ci/openfisca_france_data_config.ini similarity index 100% rename from ci-runner/openfisca_france_data_config.ini rename to .gitlab-ci/openfisca_france_data_config.ini diff --git a/ci-runner/openfisca_survey_manager_config.ini b/.gitlab-ci/openfisca_survey_manager_config.ini similarity index 89% rename from ci-runner/openfisca_survey_manager_config.ini rename to .gitlab-ci/openfisca_survey_manager_config.ini index a75a227a..0507bcb9 100644 --- a/ci-runner/openfisca_survey_manager_config.ini +++ b/.gitlab-ci/openfisca_survey_manager_config.ini @@ -1,6 +1,6 @@ # Template du fichier config.ini de openfisca-survey-manager # pour qu'il fonctionne avec oepnfisca-france-data -# sur le ci-runner gitlab piloté par ipp/openfisca-france/data +# sur le runner gitlab CI piloté par ipp/openfisca-france/data [collections] collections_directory = /mnt/data-out/openfisca-france-data/BRANCH_NAME/data_collections diff --git a/ci-runner/openfisca_survey_manager_raw_data.ini b/.gitlab-ci/openfisca_survey_manager_raw_data.ini similarity index 95% rename from ci-runner/openfisca_survey_manager_raw_data.ini rename to .gitlab-ci/openfisca_survey_manager_raw_data.ini index 00de83c8..6231dda7 100644 --- a/ci-runner/openfisca_survey_manager_raw_data.ini +++ b/.gitlab-ci/openfisca_survey_manager_raw_data.ini @@ -1,6 +1,6 @@ # Template du fichier raw_data.ini de openfisca-survey-manager # pour qu'il fonctionne avec oepnfisca-france-data -# sur le ci-runner gitlab piloté par ipp/openfisca-france/data +# sur le runner GitLab CI piloté par ipp/openfisca-france/data ; [enquete_logement] ; 2006 = 
/home/ipp/data/enquete_logement/2006/stata diff --git a/README.md b/README.md index 26031b9c..9ef2f0ec 100644 --- a/README.md +++ b/README.md @@ -87,6 +87,12 @@ This should not display any error and end with: `Successfully installed OpenFisca-France-Data...` +### Specifics due to Windows' handling of long paths + +On a Windows machine, the installation of Openfisca-France-Data may run into problems due to long path names, which Windows, by default, does not handle. These long paths are mostly inherited from the OpenFisca-France parameters, which are stored in a sometimes deeply nested folder. + +A possible workaround on Windows >= 10 is to lift the maximum path length limitation (as [indicated here](https://learn.microsoft.com/en-us/windows/win32/fileio/maximum-file-path-limitation?tabs=registry#enable-long-paths-in-windows-10-version-1607-and-later)). + ## Execution Let's say that you would like to format `ERFS-FPR` survey data into OpenFisca formatted data. diff --git a/openfisca_france_data/__init__.py b/openfisca_france_data/__init__.py index 8bb2ec7c..9b936da8 100644 --- a/openfisca_france_data/__init__.py +++ b/openfisca_france_data/__init__.py @@ -9,7 +9,7 @@ import openfisca_france # type: ignore # Load input variables and output variables into entities -from openfisca_france_data.model import common, survey_variables, id_variables # noqa analysis:ignore +from openfisca_france_data.model import common, survey_variables # noqa analysis:ignore from openfisca_france_data.model.base import * # noqa analysis:ignore diff --git a/openfisca_france_data/common.py b/openfisca_france_data/common.py index de8cf677..15bdd229 100644 --- a/openfisca_france_data/common.py +++ b/openfisca_france_data/common.py @@ -104,7 +104,7 @@ def create_salaire_de_base(individus, period = None, revenu_type = 'imposable', if name not in target: baremes_to_remove.append(name) - # We split since we cannot remove from dict while iterating + # We split since we cannot remove from dict while 
iterating for name in baremes_to_remove: del baremes_collection._children[name] @@ -230,15 +230,15 @@ def create_traitement_indiciaire_brut(individus, period = None, revenu_type = 'i contrat_de_travail = individus.contrat_de_travail heures_remunerees_volume = individus.heures_remunerees_volume - legislation = tax_benefit_system.parameters(period.start) + parameters = tax_benefit_system.parameters(period.start) - salarie = legislation.cotsoc.cotisations_salarie - plafond_securite_sociale_mensuel = legislation.prelevements_sociaux.pss.plafond_securite_sociale_mensuel - legislation_csg_deductible = legislation.prelevements_sociaux.contributions_sociales.csg.activite.deductible - taux_csg = legislation_csg_deductible.taux - taux_abattement = legislation_csg_deductible.abattement.rates[0] + salarie = parameters.cotsoc.cotisations_salarie + plafond_securite_sociale_mensuel = parameters.prelevements_sociaux.pss.plafond_securite_sociale_mensuel + parameters_csg_deductible = parameters.prelevements_sociaux.contributions_sociales.csg.activite.deductible + taux_csg = parameters_csg_deductible.taux + taux_abattement = parameters_csg_deductible.abattement.rates[0] try: - seuil_abattement = legislation_csg_deductible.abattement.thresholds[1] + seuil_abattement = parameters_csg_deductible.abattement.thresholds[1] except IndexError: # Pour gérer le fait que l'abattement n'a pas toujours été limité à 4 PSS seuil_abattement = None csg_deductible = MarginalRateTaxScale(name = 'csg_deductible') @@ -250,11 +250,11 @@ def create_traitement_indiciaire_brut(individus, period = None, revenu_type = 'i # Cas des revenus nets: # comme les salariés du privé, on ajoute CSG imposable et crds qui s'appliquent à tous les revenus # 1. 
csg imposable - legislation_csg_imposable = legislation.prelevements_sociaux.contributions_sociales.csg.activite.imposable - taux_csg = legislation_csg_imposable.taux - taux_abattement = legislation_csg_imposable.abattement.rates[0] + parameters_csg_imposable = parameters.prelevements_sociaux.contributions_sociales.csg.activite.imposable + taux_csg = parameters_csg_imposable.taux + taux_abattement = parameters_csg_imposable.abattement.rates[0] try: - seuil_abattement = legislation_csg_imposable.abattement.thresholds[1] + seuil_abattement = parameters_csg_imposable.abattement.thresholds[1] except IndexError: # Pour gérer le fait que l'abattement n'a pas toujours été limité à 4 PSS seuil_abattement = None csg_imposable = MarginalRateTaxScale(name = 'csg_imposable') @@ -262,17 +262,17 @@ def create_traitement_indiciaire_brut(individus, period = None, revenu_type = 'i if seuil_abattement is not None: csg_imposable.add_bracket(seuil_abattement, taux_csg) # 2. crds - legislation_crds = legislation.prelevements_sociaux.contributions_sociales.crds.activite - taux_csg = legislation_crds.taux - taux_abattement = legislation_crds.abattement.rates[0] + parameters_crds = parameters.prelevements_sociaux.contributions_sociales.crds.activite + taux_crds = parameters_crds.taux + taux_abattement = parameters_crds.abattement.rates[0] try: - seuil_abattement = legislation_crds.abattement.thresholds[1] + seuil_abattement = parameters_crds.abattement.thresholds[1] except IndexError: # Pour gérer le fait que l'abattement n'a pas toujours été limité à 4 PSS seuil_abattement = None crds = MarginalRateTaxScale(name = 'crds') - crds.add_bracket(0, taux_csg * (1 - taux_abattement)) + crds.add_bracket(0, taux_crds * (1 - taux_abattement)) if seuil_abattement is not None: - crds.add_bracket(seuil_abattement, taux_csg) + crds.add_bracket(seuil_abattement, taux_crds) # Check baremes target = dict() @@ -404,6 +404,9 @@ def create_revenus_remplacement_bruts(individus, period, tax_benefit_system): 
parameters = tax_benefit_system.get_parameters_at_instant(period.start) csg = parameters.prelevements_sociaux.contributions_sociales.csg csg_deductible_chomage = csg.remplacement.allocations_chomage.deductible + pss = parameters.prelevements_sociaux.pss.plafond_securite_sociale_annuel + taux_abattement_csg_chomage = parameters.prelevements_sociaux.contributions_sociales.csg.remplacement.allocations_chomage.deductible.abattement.rates[0] + seuil_abattement_csg_chomage = parameters.prelevements_sociaux.contributions_sociales.csg.remplacement.allocations_chomage.deductible.abattement.thresholds[1] taux_plein = csg_deductible_chomage.taux_plein taux_reduit = csg_deductible_chomage.taux_reduit seuil_chomage_net_exoneration = ( @@ -417,12 +420,23 @@ def create_revenus_remplacement_bruts(individus, period, tax_benefit_system): (individus.taux_csg_remplacement < 2) | (individus.chomage_imposable <= seuil_chomage_net_exoneration) ) + taux_csg_chomage = np.where( + individus.taux_csg_remplacement < 2, + 0, + (individus.taux_csg_remplacement == 2) * taux_reduit + + (individus.taux_csg_remplacement >= 3) * taux_plein + ) + threshold = seuil_abattement_csg_chomage * pss * (1 - (taux_csg_chomage * (1 - taux_abattement_csg_chomage))) + base_csg_chomage = np.where( + individus.chomage_imposable <= threshold, + individus.chomage_imposable * (1 - taux_abattement_csg_chomage) / (1 - (taux_csg_chomage * (1 - taux_abattement_csg_chomage))), + (individus.chomage_imposable - seuil_abattement_csg_chomage * taux_abattement_csg_chomage) / (1 - taux_csg_chomage) + ) individus['chomage_brut'] = np.where( exonere_csg_chomage, individus.chomage_imposable, - (individus.taux_csg_remplacement == 2) * individus.chomage_imposable / (1 - taux_reduit) - + (individus.taux_csg_remplacement >= 3) * individus.chomage_imposable / (1 - taux_plein) - ) + individus.chomage_imposable + (base_csg_chomage * taux_csg_chomage) + ) assert individus['chomage_brut'].notnull().all() csg_deductible_retraite = 
parameters.prelevements_sociaux.contributions_sociales.csg.remplacement.pensions_retraite_invalidite.deductible diff --git a/openfisca_france_data/comparator.py b/openfisca_france_data/comparator.py index 2258d41e..d788bfe2 100644 --- a/openfisca_france_data/comparator.py +++ b/openfisca_france_data/comparator.py @@ -23,18 +23,20 @@ log = logging.getLogger(__name__) +def get_entity_original_id(survey_scenario, variable): + entity = survey_scenario.tax_benefit_system.variables[variable].entity.key + return "noindiv" if entity == "individu" else "idmen_original" + + def compute_result(variable, survey_scenario, target_dataframe): result = None stats = None - entity = survey_scenario.tax_benefit_system.variables[variable].entity.key - entity_original_id = "noindiv" if entity == "individu" else "ident" + entity_original_id = get_entity_original_id(survey_scenario, variable) output_variables = [entity_original_id, variable] - entity_dataframe = survey_scenario.create_data_frame_by_entity( variables = output_variables, )[entity] - target = target_dataframe[output_variables].rename(columns = {variable: f"target_{variable}"}) if f"target_{variable}" not in target: @@ -47,12 +49,12 @@ def compute_result(variable, survey_scenario, target_dataframe): ) result[f"diff_{variable}"] = result[variable] - result[f"target_{variable}"] - result_variables = ["noindiv", variable, f"diff_{variable}", f"target_{variable}"] - stats = compute_error_stats(result, variable) + result_variables = [entity_original_id, variable, f"diff_{variable}", f"target_{variable}"] + stats = compute_error_stats(result, variable, entity_original_id = entity_original_id) return result, stats -def compute_confidence_interval(data, variable, width = .9): +def compute_confidence_interval(data, variable, width = .9, entity_original_id = None): """ Compute confidence interval @@ -64,9 +66,10 @@ def compute_confidence_interval(data, variable, width = .9): Returns: [type]: [description] """ + assert 
entity_original_id is not None df = pd.DataFrame({ "signed_values": data[variable].values, - "noind": data["noindiv"].values + "noind": data[entity_original_id].values }) df["abs_values"] = df.signed_values.abs() in_range_obs = ceil(width * len(df)) @@ -79,7 +82,8 @@ def compute_confidence_interval(data, variable, width = .9): return left, right, largest_errors -def compute_error_stats(data, variable): +def compute_error_stats(data, variable, entity_original_id): + assert entity_original_id is not None numerical = ( isinstance(data[variable].values.flat[0], np.integer) or isinstance(data[variable].values.flat[0], np.floating) @@ -89,13 +93,13 @@ def compute_error_stats(data, variable): df = data.loc[ (data[variable].values != 0.0) | (data[f"target_{variable}"].values != 0.0), - [variable, f"target_{variable}", "noindiv"] + [variable, f"target_{variable}", entity_original_id] ].copy() df["relative_error"] = (df[variable] - df[f"target_{variable}"]) / (df[f"target_{variable}"] + (df[f"target_{variable}"] == 0.0) * df[variable]) if df.empty: return - left, right, largest_errors = compute_confidence_interval(df, "relative_error") + left, right, largest_errors = compute_confidence_interval(df, "relative_error", entity_original_id = entity_original_id) less_than_5_pc_error = (df["relative_error"].abs() <= .05).sum() / len(df) less_than_20_pc_error = (df["relative_error"].abs() <= .2).sum() / len(df) more_than_80_pc_error = (df["relative_error"].abs() >= .8).sum() / len(df) @@ -130,7 +134,8 @@ def create_output_files(markdown_sections, figures_directory, filename): ) -def create_variable_distribution_figures(variable, result, bins = None, figures_directory = None): +def create_variable_distribution_figures(variable, result, bins = None, figures_directory = None, entity_original_id = None): + assert entity_original_id is not None log.debug(f"create_variable_distribution_figures: Examining {variable}") assert figures_directory is not None if bins is None: @@ -139,11 +144,12 
@@ def create_variable_distribution_figures(variable, result, bins = None, figures_ non_both_zeroes = (result[f"{variable}"].fillna(0) != 0) | (result[f"target_{variable}"].fillna(0) != 0) non_both_zeroes_count = sum(non_both_zeroes) both_zeroes_count = len(result) - non_both_zeroes_count + melted = result.loc[ non_both_zeroes, - ["noindiv", variable, f"target_{variable}"] + [entity_original_id, variable, f"target_{variable}"] ].melt( - id_vars = ["noindiv"], + id_vars = [entity_original_id], value_vars = [f"{variable}", f"target_{variable}"] ) @@ -153,11 +159,13 @@ def create_variable_distribution_figures(variable, result, bins = None, figures_ unique_values_count = melted["value"].nunique() - bins == unique_values_count if unique_values_count < bins else bins + bins = unique_values_count if unique_values_count < bins else bins print(f"create_variable_distribution_figures (total): variable = {variable}, bins = {bins}") melted["value"] = melted["value"].clip(1, melted["value"].max()) + log_scale = bins > 10 + sns_plot = sns.histplot( data = melted, # palette = "crest", @@ -169,7 +177,7 @@ def create_variable_distribution_figures(variable, result, bins = None, figures_ hue = "variable", linewidth = 0, x = "value", - log_scale = True, + log_scale = log_scale, ) sns_plot.annotate( @@ -246,8 +254,8 @@ def create_variable_markdown_summary_section(variable, stats, figures_directory) return markdown_section -def create_diff_variable_distribution_figures(variable, result, bins = None, figures_directory = None): - +def create_diff_variable_distribution_figures(variable, result, bins = None, figures_directory = None, entity_original_id = None): + assert entity_original_id is not None numerical = ( isinstance(result[f"{variable}"].values.flat[0], np.integer) or isinstance(result[f"{variable}"].values.flat[0], np.floating) @@ -475,8 +483,18 @@ def compute_divergence(self, input_dataframe_by_entity, target_dataframe_by_enti ) result_by_variable[variable] = result - 
variable_distribution_figures_created = create_variable_distribution_figures(variable, result, figures_directory = figures_directory) - diff_variable_distribution_figures_created = create_diff_variable_distribution_figures(variable, result, figures_directory = figures_directory) + variable_distribution_figures_created = create_variable_distribution_figures( + variable, + result, + figures_directory = figures_directory, + entity_original_id = get_entity_original_id(survey_scenario, variable), + ) + diff_variable_distribution_figures_created = create_diff_variable_distribution_figures( + variable, + result, + figures_directory = figures_directory, + entity_original_id = get_entity_original_id(survey_scenario, variable) + ) stats_by_variable[variable] = stats variable_markdown_section = create_variable_markdown_section( diff --git a/openfisca_france_data/debugger.py b/openfisca_france_data/debugger.py deleted file mode 100644 index 8ed43ac2..00000000 --- a/openfisca_france_data/debugger.py +++ /dev/null @@ -1,437 +0,0 @@ -#! 
/usr/bin/env python -import logging - - -import numpy -from pandas import merge, concat, DataFrame - - -from openfisca_france_data.erfs.input_data_builder.base import ( - year_specific_by_generic_data_frame_name) -from openfisca_france_data.erfs import get_erf2of, get_of2erf -from openfisca_plugin_aggregates.aggregates import Aggregates -from openfisca_survey_manager.statshelpers import mark_weighted_percentiles as mwp -from openfisca_survey_manager.survey_collections import SurveyCollection - - -from openfisca_parsers import input_variables_extractors - -log = logging.getLogger(__name__) - - -def clean(parameter): - return parameter[:-len('_holder')] if parameter.endswith('_holder') else parameter - - -class Debugger(object): - def __init__(self): - super(Debugger, self).__init__() - self.erf_menage = None - self.erf_eec_indivi = None - self.of_menages_data_frame = None - self.of_individus_data_frame = None - self.variable = None - self.survey_scenario = None - - def set_survey_scenario(self, survey_scenario = None): - assert survey_scenario is not None - self.survey_scenario = survey_scenario - self.variables = self.survey_scenario.simulation.tax_benefit_system.variables - self.simulation = self.survey_scenario.simulation - assert survey_scenario.simulation is not None, "The simulation attibute of survey_scenario is None" - - def set_variable(self, variable): - if isinstance(variable, list): - self.variable = variable[0] - else: - self.variable = variable - - def show_aggregates(self): - from openfisca_france_data.erf.aggregates import build_erf_aggregates - - assert self.survey_scenario is not None, 'simulation attribute is None' - assert self.variable is not None, 'variable attribute is None' - variable = self.variable - openfisca_aggregates = Aggregates() - openfisca_aggregates.set_survey_scenario(self.survey_scenario) - openfisca_aggregates.compute() - - variables = self.variables - temp = (build_erf_aggregates(variables=[variable], year= 
self.survey_scenario.year)) - selection = openfisca_aggregates.aggr_frame["Mesure"] == variables[variable].label - print(openfisca_aggregates.aggr_frame[selection]) - print(temp) - # TODO: clean this - return - - def extract(self, data_frame, entities = "men"): - variables = self.variables - filtered_data_frame_columns = list(set(variables.keys()).intersection(set(data_frame.columns))) - extracted_columns = [column_name for column_name in filtered_data_frame_columns - if variables[column_name].entity in entities] - extracted_columns = list(set(extracted_columns).union(set(['idmen']))) - return data_frame[extracted_columns].copy() - - def get_all_parameters(self, column_list): - global x - print([column.name for column in column_list]) - x = x + 1 - if x == 20: - boum - variables = self.variables - tax_benefit_system = self.survey_scenario.simulation.tax_benefit_system - - extractor = input_variables_extractors.setup(tax_benefit_system) - - if len(column_list) == 0: - return [] - else: - column_name = column_list[0].name - print(column_name) - if extractor.get_input_variables(variables[column_name]) is None: - return column_list - else: - first_column = [column_list[0]] - input_columns = self.get_all_parameters([ - variables[clean(parameter)] - for parameter in list(extractor.get_input_variables(variables[column_name])) - ]) - other_columns = list( - set(self.get_all_parameters(column_list[1:])) - set(first_column + input_columns) - ) - print('input_variables: ', [column.name for column in input_columns]) - - print('new_variables: ', [column.name for column in other_columns]) - - new_column_list = first_column + input_columns + other_columns - print('final list: ', [column.name for column in new_column_list]) - return new_column_list - - def build_columns_to_fetch(self): - variables = self.variables -# parameters_column = self.get_all_parameters([variables.get(x) for x in [self.variable]]) -# parameters = [x.name for x in parameters_column] - parameters = 
[self.variable] - # We want to get all parameters and consumers that we're going to encounter -# consumers = [] -# for variable in [self.variable]: -# column = variables.get(variable) -# consumers = list(set(consumers).union(set(column.consumers))) -# column_names = list(set(parameters).union(set(consumers))) - - # self.columns_to_fetch = column_names - # self.variable_consumers = list(set(consumers)) - self.variable_parameters = list(set(parameters)) - self.columns_to_fetch = list(set(parameters)) - - def build_openfisca_data_frames(self): - column_names = self.columns_to_fetch - for column in column_names: - assert column in survey_scenario.tax_benefit_system.variables.keys() - data_frame_by_entity_key_plural = survey_scenario.create_data_frame_by_entity( - variables = column_names + ['idmen_original'], - indices = True, - roles = True, - ) - self.data_frame_by_entity_key_plural = data_frame_by_entity_key_plural - - projected = self.project_on(data_frame_by_entity_key_plural = data_frame_by_entity_key_plural) - idmen_original_by_idmen = dict( - zip( - data_frame_by_entity_key_plural['menages'].index.values, - data_frame_by_entity_key_plural['menages']["idmen_original"].values - ) - ) - self.idmen_original_by_idmen = idmen_original_by_idmen - - idmen_by_idmen_original = dict( - zip( - data_frame_by_entity_key_plural['menages']["idmen_original"].values, - data_frame_by_entity_key_plural['menages'].index.values, - ) - ) - self.idmen_by_idmen_original = idmen_by_idmen_original - - data_frame_by_entity_key_plural['menages'] = projected.rename( - columns = {"idmen_original": "idmen"}) - data_frame_by_entity_key_plural['individus'].replace( - {'idmen': idmen_original_by_idmen}, inplace = True) - self.data_frame_by_entity_key_plural = data_frame_by_entity_key_plural - - def project_on(self, receiving_entity_key_plural = 'menages', data_frame_by_entity_key_plural = None): - tax_benefit_system = self.survey_scenario.tax_benefit_system - assert 
data_frame_by_entity_key_plural is not None - assert receiving_entity_key_plural is not tax_benefit_system.person_key_plural - - entity_data_frame = data_frame_by_entity_key_plural[receiving_entity_key_plural] - person_data_frame = data_frame_by_entity_key_plural[tax_benefit_system.person_key_plural] - - entity_keys_plural = list( - set(tax_benefit_system.entity_class_by_key_plural.keys()).difference(set( - [tax_benefit_system.person_key_plural, receiving_entity_key_plural] - )) - ) - - for entity_key_plural in entity_keys_plural: - entity = tax_benefit_system.entity_class_by_key_plural[entity_key_plural] - # Getting only heads of other entities prenent in the projected on entity - boolean_index = person_data_frame[entity.role_for_person_variable_name] == 0 # Heads - index_entity = person_data_frame.loc[boolean_index, entity.index_for_person_variable_name].values # Ent. - for column_name, column_series in self.data_frame_by_entity_key_plural[entity_key_plural].items(): - person_data_frame.loc[boolean_index, column_name] \ - = column_series.iloc[index_entity].values - person_data_frame[column_name].fillna(0) - - receiving_entity = tax_benefit_system.entity_class_by_key_plural[receiving_entity_key_plural] - grouped_data_frame = person_data_frame.groupby(by = receiving_entity.index_for_person_variable_name).agg(sum) - grouped_data_frame.drop(receiving_entity.role_for_person_variable_name, axis = 1, inplace = True) - data_frame = concat([entity_data_frame, grouped_data_frame], axis = 1) - - assert data_frame.notnull().all().all() - return data_frame - - def build_erf_data_frames(self): - # TODO: remove this - self.columns_to_fetch = ['af'] - variables = self.columns_to_fetch - erf_survey_collection = SurveyCollection.load( - collection = "erfs", config_files_directory = config_files_directory) - erf_survey = erf_survey_collection.get_survey("erfs_{}".format(year)) - year_specific_by_generic = year_specific_by_generic_data_frame_name(year) - generic_by_year_specific = 
dict(zip(year_specific_by_generic.values(), year_specific_by_generic.keys())) - - erf_variables = list(set(variables + ["ident", "wprm", "quelfic", "noi"])) - of2erf = get_of2erf() - for index, variable in enumerate(erf_variables): - if variable in of2erf: - erf_variables[index] = of2erf[variable] - data_frame_by_table = dict(eec_indivi = None, erf_indivi = None, erf_menage = None) - erf_variables_by_generic_table = dict(eec_indivi = [], erf_indivi = [], erf_menage = []) - - year_specific_tables_by_erf_variable = dict( - [ - ( - erf_variable, - set( - erf_survey.find_tables(variable = erf_variable) - ).intersection( - set([year_specific_by_generic[key] for key in erf_variables_by_generic_table.keys()]) - ) - ) for erf_variable in erf_variables - ] - ) - for variable, year_specific_tables in year_specific_tables_by_erf_variable.items(): - if len(year_specific_tables) < 1: - log.info("No tables are present for variable {}".format(variable)) - continue - else: - log.info("Variable {} is present in multiple tables : {}".format(variable, year_specific_tables)) - for table in year_specific_tables: - log.info("Variable {} is retrieved from table {}".format(variable, table)) - erf_variables_by_generic_table[generic_by_year_specific[table]].append(variable) - - erf2of = get_erf2of() - - for table, erf_variables in erf_variables_by_generic_table.items(): - if erf_variables: - data_frame_by_table[table] = erf_survey.get_values( - variables = erf_variables, table = year_specific_by_generic[table] - ) - data_frame_by_table[table].rename(columns = erf2of, inplace = True) - data_frame_by_table[table].rename(columns = {'ident': 'idmen'}, inplace = True) - - assert not data_frame_by_table["erf_menage"].duplicated().any(), "Duplicated idmen in erf_menage" - self.erf_data_frame_by_entity_key_plural = dict( - menages = data_frame_by_table["erf_menage"], - individus = data_frame_by_table["erf_indivi"].merge(data_frame_by_table["eec_indivi"]) - ) - # TODO: fichier foyer - - def 
get_major_differences(self): - variable = self.variable - - of_menages_data_frame = self.data_frame_by_entity_key_plural['menages'] - erf_menages_data_frame = self.erf_data_frame_by_entity_key_plural['menages'] - - merged_menage_data_frame = merge( - erf_menages_data_frame[[variable, 'idmen']], - of_menages_data_frame[[variable, 'idmen']], - on = 'idmen', - how = 'inner', - suffixes = ('_erf', '_of') - ) - - log.info('Length of merged_menage_data_frame is {}'.format(len(merged_menage_data_frame))) - merged_menage_data_frame.set_index('idmen', drop = False, inplace = True) - table = merged_menage_data_frame[ - numpy.logical_and( - merged_menage_data_frame[variable + '_erf'] != 0, - merged_menage_data_frame[variable + '_of'] != 0 - ) - ] - table[variable + "_rel_diff"] = (table[variable + '_of'] - table[variable + '_erf']) \ - / table[variable + '_erf'] # Difference relative - log.info( - "Minimum difference between the two tables for {} is {}".format( - variable, str(table[variable + "_rel_diff"].min()) - ) - ) - log.info( - "Maximum difference between the two tables for {} is {}".format( - variable, str(table[variable + "_rel_diff"].max()) - ) - ) - table[variable + '_ratio'] = ( - table[variable + '_of'] / table[variable + '_erf'] - ) - log.info(table[variable + "_rel_diff"].describe()) - try: - assert len(table[variable + "_rel_diff"]) == len(table['wprm_of']), "PINAGS" - dec, values = mwp( - table[variable + "_rel_diff"], - numpy.arange(1, 11), table['wprm_of'], - 2, - return_quantiles = True - ) - log.info(sorted(values)) - dec, values = mwp( - table[variable + "_rel_diff"], - numpy.arange(1, 101), - table['wprm_erf'], - 2, - return_quantiles = True - ) - log.info(sorted(values)[90:]) - del dec, values - except Exception: - log.info('Weighted percentile method did not work for {}'.format(variable + "_rel_diff")) - pass - table.sort(columns = variable + "_rel_diff", ascending = False, inplace = True) - - print(table[:10].to_string()) - return table - - def 
describe_discrepancies(self, fov = 10, consumers = False, parameters = True, descending = True, to_men = False): - variable = self.variable - major_differences_data_frame = self.get_major_differences() - major_differences_data_frame.sort( - columns = self.variable + "_rel_diff", - ascending = not descending, - inplace = True - ) - debug_data_frame = major_differences_data_frame[0:fov].copy() - del major_differences_data_frame - - of_menages_data_frame = self.data_frame_by_entity_key_plural['menages'] - of_individus_data_frame = self.data_frame_by_entity_key_plural['individus'] - erf_individus_data_frame = self.erf_data_frame_by_entity_key_plural['individus'] - erf_menages_data_frame = self.erf_data_frame_by_entity_key_plural['menages'] - return debug_data_frame - - kept_columns = set() - if parameters: - kept_columns.update(set(self.variable_parameters)) - if consumers: - kept_columns.update(set(self.variable_consumers)) - kept_columns = list(kept_columns) - kept_columns = list(set(kept_columns).union( - set(['idmen', 'idfam', 'idfoy', 'quimen', 'quifam', 'quifoy'] + list(major_differences_data_frame.columns))) - ) - - if to_men: - entities_ind = ['ind'] - entities_men = ['men', 'fam', 'foy'] - else: - entities_ind = ['ind', 'fam', 'foy'] - entities_men = ['men'] - - debug_data_frame = debug_data_frame.merge( - self.extract(of_menages_data_frame, entities = entities_men), - how = 'inner', - on = 'idmen', - ) - - print(debug_data_frame.to_string()) - - debug_data_frame = debug_data_frame.merge( - self.extract(of_individus_data_frame, entities = entities_ind), - how = 'inner', - on = 'idmen', - ) - - debug_data_frame = debug_data_frame.merge( - erf_individus_data_frame, - how = 'inner', - on = 'idmen', - ) - - suffixes = ["_erf", "_of", "_rel_diff", "_ratio"] - reordered_columns = [variable + suffixe for suffixe in suffixes] \ - + ["idmen", "quimen", "idfam", "quifam", "idfoy", "quifoy"] - reordered_columns = reordered_columns + list(set(kept_columns) - 
set(reordered_columns)) - debug_data_frame = debug_data_frame[reordered_columns].copy() - return debug_data_frame - - def generate_test_case(self): - entity_class_by_key_plural = self.survey_scenario.tax_benefit_system.entity_class_by_key_plural - menages_entity = entity_class_by_key_plural['menages'] - idmen_by_idmen_original = self.idmen_by_idmen_original - idmen_original = self.describe_discrepancies(descending = False)[ - menages_entity.index_for_person_variable_name].iloc[0] - idmen = idmen_by_idmen_original[idmen_original] - input_data_frame = self.survey_scenario.input_data_frame - individus_index = input_data_frame.index[ - input_data_frame[menages_entity.index_for_person_variable_name] == idmen] - index_by_entity = { - entity_class_by_key_plural['individus']: individus_index, - } - for entity in list(entity_class_by_key_plural.values()): - if entity.key_plural != 'individus': - index_by_entity[entity] = input_data_frame.loc[ - individus_index, entity.index_for_person_variable_name].unique() - - extracted_indices = individus_index - for entity, entity_index in index_by_entity.items(): - if entity.key_plural in ['menages', 'individus']: - continue - extracted_indices = extracted_indices + \ - input_data_frame.index[input_data_frame[entity.index_for_person_variable_name].isin(entity_index)] - - extracted_input_data_frame = input_data_frame.loc[extracted_indices] - return extracted_input_data_frame - - -if __name__ == '__main__': - import sys - from openfisca_plugin_aggregates.tests.test_aggregates import create_survey_scenario - logging.basicConfig(level = logging.INFO, stream = sys.stdout) - restart = True - if restart: - year = 2009 - survey_scenario = create_survey_scenario(year) - survey_scenario.simulation = survey_scenario.new_simulation() - - debugger = Debugger() - debugger.set_survey_scenario(survey_scenario = survey_scenario) - debugger.set_variable('af') - debugger.build_columns_to_fetch() - debugger.build_openfisca_data_frames() - 
debugger.build_erf_data_frames() - - # df_menage = debugger.data_frame_by_entity_key_plural['menages'] - # df_famille = debugger.data_frame_by_entity_key_plural['familles'] - # df_individus = debugger.data_frame_by_entity_key_plural['individus'] - - # df = debugger.get_major_differences() - - # debugger.show_aggregates() - df = debugger.describe_discrepancies(descending = False) - df = debugger.generate_test_case() - - boum - entity_class_by_key_plural = debugger.survey_scenario.tax_benefit_system.entity_class_by_key_plural - menages_entity = entity_class_by_key_plural['menages'] - - idmen = debugger.describe_discrepancies(descending = False)[menages_entity.index_for_person_variable_name].iloc[0] - - input_data_frame = debugger.survey_scenario.input_data_frame diff --git a/openfisca_france_data/erfs/scenario.py b/openfisca_france_data/erfs/scenario.py index 19195397..46ad488b 100644 --- a/openfisca_france_data/erfs/scenario.py +++ b/openfisca_france_data/erfs/scenario.py @@ -26,6 +26,7 @@ class ErfsSurveyScenario(AbstractErfsSurveyScenario): 'nbN', 'nbR', 'pensions_alimentaires_percues', + 'primes_fonction_publique', 'rag', 'retraite_brute', 'retraite_imposable', @@ -36,6 +37,7 @@ class ErfsSurveyScenario(AbstractErfsSurveyScenario): 'statut_marital', 'statut_occupation_logement', 'taxe_habitation', + 'traitement_indiciaire_brut', 'zone_apl', ] diff --git a/openfisca_france_data/erfs_fpr/comparison.py b/openfisca_france_data/erfs_fpr/comparison.py index a649f4bf..cfb4e87b 100644 --- a/openfisca_france_data/erfs_fpr/comparison.py +++ b/openfisca_france_data/erfs_fpr/comparison.py @@ -15,15 +15,19 @@ openfisca_by_erfs_fpr_variables = { - "chomage_i": "chomage_net", + "chomage_i": "chomage_imposable", "ident": "idmen_original", "noindiv": "noindiv", - "rag_i": "rag_net", - "retraites_i": "retraite_nette", # TODO: CHECk + "rag_i": "rag", + "retraites_i": "retraite_imposable", # TODO: CHECk "rev_fonciers_bruts": "f4ba", - "ric_i": "ric_net", - "rnc_i": "rnc_net", - 
"salaires_i": "salaire_net", + "ric_i": "ric", + "rnc_i": "rnc", + "salaires_i": "salaire_imposable", + "logt": "statut_occupation_logement", + "rev_fonciers_bruts": "revenu_categoriel_foncier_menage", + "rev_valeurs_mobilieres_bruts": "revenus_capitaux_prelevement_forfaitaire_unique_ir_menage", + "rev_financier_prelev_lib_imputes": "rev_financier_prelev_lib_imputes_menage", } @@ -31,15 +35,27 @@ class ErfsFprtoInputComparator(AbstractComparator): name = "erfs_fpr" period = None default_target_variables = [ - "chomage_net", - # "rag_net", TODO: does not exist in openfisca - "retraite_nette", - # "ric_net", TODO: does not exist in openfisca - # "rnc_net", TODO: does not exist in openfisca - # "f4ba", - "salaire_net", + "chomage_imposable", + "loyer", + # "rag", + "retraite_imposable", + # "ric", + # "rnc", + "salaire_imposable", + "statut_occupation_logement", ] + from openfisca_france_data.erfs_fpr.get_survey_scenario import menage_projected_variables + + target_menage_projected_variables = [ + f"{menage_projected_variable}_menage" + for menage_projected_variable + in menage_projected_variables + ] + + default_target_variables += target_menage_projected_variables + + def compute_test_dataframes(self): erfs_fpr_survey_collection = SurveyCollection.load(collection = "erfs_fpr") # infer names of the survey and data tables @@ -64,6 +80,9 @@ def compute_test_dataframes(self): "individu": openfisca_individu, "menage": openfisca_menage, } + + fpr_menage.loyer = 12 * fpr_menage.loyer + target_dataframe_by_entity = { "individu": fpr_individu.rename(columns = openfisca_by_erfs_fpr_variables), "menage": fpr_menage.rename(columns = openfisca_by_erfs_fpr_variables), diff --git a/openfisca_france_data/erfs_fpr/get_survey_scenario.py b/openfisca_france_data/erfs_fpr/get_survey_scenario.py index b55b379a..4f8e065c 100644 --- a/openfisca_france_data/erfs_fpr/get_survey_scenario.py +++ b/openfisca_france_data/erfs_fpr/get_survey_scenario.py @@ -2,13 +2,97 @@ from multipledispatch 
import dispatch # type: ignore +from openfisca_core.model_api import Variable, ADD, YEAR from openfisca_core.reforms import Reform # type: ignore from openfisca_core.taxbenefitsystems import TaxBenefitSystem # type: ignore +from openfisca_france.entities import Individu, FoyerFiscal, Menage from openfisca_france_data.erfs_fpr.scenario import ErfsFprSurveyScenario from openfisca_france_data import france_data_tax_benefit_system +from openfisca_france_data.model.id_variables import ( + idmen_original, + noindiv, + ) + +variables_converted_to_annual = [ + "salaire_imposable", + "chomage_imposable", + "retraite_imposable", + "salaire_net", + "chomage_net", + "retraite_nette", + "ppa", + ] + + +menage_projected_variables = [ + # "rev_financier_prelev_lib_imputes", + "revenu_categoriel_foncier", + "revenus_capitaux_prelevement_forfaitaire_unique_ir", + ] + + +class erfs_fpr_plugin(Reform): + name = "ERFS-FPR ids plugin" + + def apply(self): + + for variable in variables_converted_to_annual: + class_name = f"{variable}_annuel" + label = f"{variable} sur l'année entière" + + def annual_formula_creator(variable): + def formula(individu, period): + result = individu(variable, period, options = [ADD]) + return result + + formula.__name__ = 'formula' + + return formula + + variable_instance = type(class_name, (Variable,), dict( + value_type = float, + entity = self.variables[variable].entity, + label = label, + definition_period = YEAR, + formula = annual_formula_creator(variable), + )) + + self.add_variable(variable_instance) + del variable_instance + + for variable in menage_projected_variables: + class_name = f"{variable}_menage" + label = f"{variable} agrégée à l'échelle du ménage" + + def projection_formula_creator(variable): + def formula(menage, period): + result_i = menage.members.foyer_fiscal(variable, period, options = [ADD]) + result = menage.sum(result_i, role = FoyerFiscal.DECLARANT_PRINCIPAL) + return result + + formula.__name__ = 'formula' + + return formula + + 
variable_instance = type(class_name, (Variable,), dict( + value_type = float, + entity = Menage, + label = label, + definition_period = YEAR, + formula = projection_formula_creator(variable), + )) + + self.add_variable(variable_instance) + del variable_instance + + + self.add_variable(idmen_original) + self.add_variable(noindiv) + + def get_survey_scenario( year: int = None, rebuild_input_data: bool = False, @@ -40,15 +124,6 @@ def get_survey_scenario( baseline_tax_benefit_system, ) - - from openfisca_france_data.model.id_variables import ( - idmen_original, - noindiv, - ) - - tax_benefit_system.add_variable(idmen_original) - tax_benefit_system.add_variable(noindiv) - if not use_marginal_tax_rate: survey_scenario = ErfsFprSurveyScenario.create( tax_benefit_system = tax_benefit_system, @@ -69,6 +144,7 @@ def get_survey_scenario( # S'il n'y a pas de données, on sait où les trouver. if data is None: input_data_table_by_entity = dict( + foyer_fiscal = f"foyer_fiscal_{year}", individu = f"individu_{year}", menage = f"menage_{year}", ) @@ -78,7 +154,6 @@ def get_survey_scenario( data = dict( input_data_table_by_entity_by_period = input_data_table_by_entity_by_period, - # input_data_survey_prefix = "openfisca_erfs_fpr_data", survey = survey_name ) @@ -114,7 +189,7 @@ def get_tax_benefit_system( tax_benefit_system: None, reform: Reform, ) -> TaxBenefitSystem: - return reform(france_data_tax_benefit_system) + return reform(erfs_fpr_plugin(france_data_tax_benefit_system)) # Appelé quand *tax_benefit_system* et *reform* sont `None` @@ -123,7 +198,7 @@ def get_tax_benefit_system( tax_benefit_system: None, reform: None, ) -> TaxBenefitSystem: - return france_data_tax_benefit_system + return erfs_fpr_plugin(france_data_tax_benefit_system) # Appelé quand *tax_benefit_system* est un :class:`TaxBenefitSystem` @@ -139,4 +214,4 @@ def get_baseline_tax_benefit_system( def get_baseline_tax_benefit_system( tax_benefit_system: None, ) -> TaxBenefitSystem: - return 
france_data_tax_benefit_system + return erfs_fpr_plugin(france_data_tax_benefit_system) diff --git a/openfisca_france_data/erfs_fpr/input_data_builder/__init__.py b/openfisca_france_data/erfs_fpr/input_data_builder/__init__.py index f2037474..5dc8df4b 100644 --- a/openfisca_france_data/erfs_fpr/input_data_builder/__init__.py +++ b/openfisca_france_data/erfs_fpr/input_data_builder/__init__.py @@ -16,9 +16,11 @@ from openfisca_france_data.erfs_fpr.input_data_builder import ( step_01_preprocessing as preprocessing, # step_02_imputation_loyer as imputation_loyer, + step_02_menage as menage, step_03_variables_individuelles as variables_individuelles, step_04_famille as famille, - step_05_final as final, + step_05_foyer as foyer, + step_06_final as final, ) log = logging.getLogger(__name__) @@ -48,7 +50,7 @@ def build(year: int, export_flattened_df_filepath: str = None) -> None: # - On merge les tables individus / menages # # Note : c'est ici où on objectivise les hypothèses, step 1 - log.info('\n [[[ Year {} - Step 1 / 5 ]]] \n'.format(year)) + log.info('\n [[[ Year {} - Step 1 / 6 ]]] \n'.format(year)) preprocessing.build_merged_dataframes(year = year) # Step 02 : Si on veut calculer les allocations logement, il faut faire le matching avec une autre enquête (ENL) @@ -57,10 +59,11 @@ def build(year: int, export_flattened_df_filepath: str = None) -> None: # stata_directory = openfisca_survey_collection.config.get('data', 'stata_directory') # stata_file = os.path.join(stata_directory, 'log_men_ERFS.dta') # imputation_loyer.merge_imputation_loyer(stata_file = stata_file, year = year) - log.info('\n [[[ Year {} - Step 2 / 5 SKIPPED ]]] \n'.format(year)) + log.info('\n [[[ Year {} - Step 2 / 6 SKIPPED ]]] \n'.format(year)) + menage.build_variables_menage(year = year) # Step 03 : on commence par les variables indivuelles - log.info('\n [[[ Year {} - Step 3 / 5 ]]] \n'.format(year)) + log.info('\n [[[ Year {} - Step 3 / 6 ]]] \n'.format(year)) 
variables_individuelles.build_variables_individuelles(year = year) # Step 04 : ici on va constituer foyer et famille à partir d'invididu et ménage @@ -69,15 +72,18 @@ def build(year: int, export_flattened_df_filepath: str = None) -> None: # - On va faire des suppositions pour faire les familles # - On va faire les foyers fiscaux à partir des familles # - On va faire de suppositions pour faire les foyers fiscaux - log.info('\n [[[ Year {} - Step 4 / 5 ]]] \n'.format(year)) + log.info('\n [[[ Year {} - Step 4 / 6 ]]] \n'.format(year)) famille.build_famille(year = year) + log.info('\n [[[ Year {} - Step 5 / 6 ]]] \n'.format(year)) + foyer.build_variables_foyers_fiscal(year = year) + # Affreux ! On injectait tout dans un même DataFrame !!! # C'est très moche ! # # On crée une df par entité par période. # Elles sont stockées dans un fichier h5 - log.info('\n [[[ Year {} - Step 5 / 5 ]]] \n'.format(year)) + log.info('\n [[[ Year {} - Step 6 / 6 ]]] \n'.format(year)) final.create_input_data_frame(year = year, export_flattened_df_filepath = export_flattened_df_filepath) diff --git a/openfisca_france_data/erfs_fpr/input_data_builder/step_02_menage.py b/openfisca_france_data/erfs_fpr/input_data_builder/step_02_menage.py new file mode 100644 index 00000000..538f1fc3 --- /dev/null +++ b/openfisca_france_data/erfs_fpr/input_data_builder/step_02_menage.py @@ -0,0 +1,18 @@ +import logging +import pandas as pd + + +from openfisca_survey_manager.temporary import temporary_store_decorator # type: ignore + +log = logging.getLogger(__name__) + + +@temporary_store_decorator(file_name = 'erfs_fpr') +def build_variables_menage(temporary_store = None, year = None): + if year >= 2018: + menages = temporary_store['menages_{}'.format(year)] + menages['zone_apl'] = 2 + # pour l'instant on met tout le monde à 2 mais à améliorer, peut être en fonction de la taille de l'aire urbaine ? 
+ menages['statut_occupation_logement'] = menages['so'].copy() + menages.loc[(menages.statut_occupation_logement == 7),'statut_occupation_logement'] = 2 + temporary_store['menages_{}'.format(year)] = menages diff --git a/openfisca_france_data/erfs_fpr/input_data_builder/step_03_variables_individuelles.py b/openfisca_france_data/erfs_fpr/input_data_builder/step_03_variables_individuelles.py index 7b36012a..92364169 100644 --- a/openfisca_france_data/erfs_fpr/input_data_builder/step_03_variables_individuelles.py +++ b/openfisca_france_data/erfs_fpr/input_data_builder/step_03_variables_individuelles.py @@ -7,7 +7,6 @@ from openfisca_france_data.common import ( create_salaire_de_base, create_traitement_indiciaire_brut, - create_revenus_remplacement_bruts, ) from openfisca_france_data import openfisca_france_tax_benefit_system from openfisca_france_data.smic import ( @@ -30,13 +29,13 @@ def build_variables_individuelles(temporary_store = None, year = None): individus = temporary_store['individus_{}_post_01'.format(year)] openfisca_by_erfs_variable = { - 'chomage_i': 'chomage_net', + 'chomage_i': 'chomage_imposable', 'pens_alim_recue_i': 'pensions_alimentaires_percues', 'rag_i': 'rag_net', - 'retraites_i': 'retraite_nette', + 'retraites_i': 'retraite_imposable', 'ric_i': 'ric_net', 'rnc_i': 'rnc_net', - 'salaires_i': 'salaire_net', + 'salaires_i': 'salaire_imposable', } for variable in openfisca_by_erfs_variable.keys(): @@ -48,104 +47,61 @@ def build_variables_individuelles(temporary_store = None, year = None): ) create_variables_individuelles(individus, year) assert 'salaire_de_base' in individus.columns , 'salaire de base not in individus' + assert 'traitement_indiciaire_brut' in individus.columns , 'traitement indiciaire brut not in individus' + assert 'primes_fonction_publique' in individus.columns , 'primes fonction publique not in individus' temporary_store['individus_{}'.format(year)] = individus return individus # helpers -def 
create_variables_individuelles(individus, year, survey_year = None): - """Création des variables individuelles""" +def create_variables_individuelles(individus, year, survey_year = None, revenu_type = 'imposable'): + """Création des variables individuelles.""" + period = periods.period(year) + tax_benefit_system = openfisca_france_tax_benefit_system + # variables démographiques create_ages(individus, year) create_date_naissance(individus, age_variable = None, annee_naissance_variable = 'naia', mois_naissance = 'naim', year = year) + # Base pour constituer les familles, foyers, etc. + create_statut_matrimonial(individus) + + # variable d'activite create_activite(individus) - revenu_type = 'net' - period = periods.period(year) - create_revenus(individus, revenu_type = revenu_type) create_contrat_de_travail(individus, period = period, salaire_type = revenu_type) create_categorie_salarie(individus, period = period, survey_year = survey_year) - # Il faut que la base d'input se fasse au millésime des données - # On fait ça car, aussi bien le TaxBenefitSystem et celui réformé peuvent être des réformes - # Par exemple : si je veux calculer le diff entre le PLF2019 et un ammendement, - # je besoin d'un droit courant comme même du droit courrant pour l'année des données - tax_benefit_system = openfisca_france_tax_benefit_system - - # On n'a pas le salaire brut mais le salaire net ou imposable, on doit l'inverser + # inversion des revenus pour retrouver le brut + # pour les revenus de remplacement on a la csg et la crds dans l'erfs-fpr donc on peut avoir le brut directement + create_revenus_remplacement_bruts(individus) + # On n'a pas le salaire et le traitement_indiciaire brut, on doit l'inverser + # comme on a la crds et la csg non déductible on recalcule l'imposable puis on inverse l'imposable pour avoir le brut + #individus['salaire_imposable'] = individus.salaire_net + individus.csg_nd_crds_sal_i create_salaire_de_base( individus, period = period, revenu_type = 
revenu_type, - tax_benefit_system = tax_benefit_system + tax_benefit_system = tax_benefit_system, + ) + create_traitement_indiciaire_brut( + individus, + period = period, + revenu_type = revenu_type, + tax_benefit_system = tax_benefit_system, ) + create_traitement_indiciaire_brut( + individus, + period = period, + revenu_type = revenu_type, + tax_benefit_system = tax_benefit_system) # Pour les cotisations patronales qui varient avec la taille de l'entreprise' create_effectif_entreprise(individus, period = period, survey_year = survey_year) - # Base pour constituer les familles, foyers, etc. - create_statut_matrimonial(individus) - assert 'salaire_de_base' in individus.columns , 'salaire de base not in individus' return individus -def create_individu_variables_brutes(individus, revenu_type = None, period = None, - tax_benefit_system = None, mass_by_categorie_salarie = None, - calibration_eec = False): - """ - Crée les variables brutes de revenus: - - salaire_de_base - - traitement_indiciaire_brut - - primes_fonction_publique - - retraite_bruite - - chomage_brut - à partir des valeurs nettes ou imposables de ces revenus - et d'autres information individuelles - """ - assert revenu_type in ['imposable', 'net'] - assert period is not None - assert tax_benefit_system is not None - - assert 'age' in individus.columns - - created_variables = [] - create_contrat_de_travail(individus, period = period, salaire_type = revenu_type) - created_variables.append('contrat_de_travail') - created_variables.append('heures_remunerees_volume') - - create_categorie_salarie(individus, period = period) - created_variables.append('categorie_salarie') - create_categorie_non_salarie(individus) - created_variables.append('categorie_non_salarie') - - # FIXME: categorie_non_salarie modifie aussi categorie_salarie !! 
- if (mass_by_categorie_salarie is not None) & (calibration_eec is True): - calibrate_categorie_salarie(individus, year = None, mass_by_categorie_salarie = mass_by_categorie_salarie) - - create_salaire_de_base(individus, period = period, revenu_type = revenu_type, tax_benefit_system = tax_benefit_system) - created_variables.append('salaire_de_base') - - create_effectif_entreprise(individus, period = period) - created_variables.append('effectif_entreprise') - - create_traitement_indiciaire_brut(individus, period = period, revenu_type = revenu_type, - tax_benefit_system = tax_benefit_system) - created_variables.append('traitement_indiciaire_brut') - created_variables.append('primes_fonction_publique') - - create_taux_csg_remplacement(individus, period, tax_benefit_system) - created_variables.append('taux_csg_remplacement') - created_variables.append('taux_csg_remplacement_n_1') - created_variables.append('rfr_special_csg_n') - created_variables.append('rfr_special_csg_n_1') - - create_revenus_remplacement_bruts(individus, period, tax_benefit_system) - created_variables.append('chomage_brut') - created_variables.append('retraite_brute') - return created_variables - - def create_activite(individus): """Création de la variable activite. 
@@ -540,31 +496,6 @@ def create_categorie_non_salarie(individus): profession_liberale, 'categorie_non_salarie' ] = 3 - # Correction fonction publique - individus.loc[ - ( - (individus.categorie_salarie == 0) - & (individus.cstot.isin([31, 33, 34, 35, 37, 38,])) - ), - 'categorie_salarie' - ] = 1 - - # Correction encadrement - individus.loc[ - ( - (individus.categorie_salarie == 0) - & (individus.cstot.isin([31, 34, 35, 37 ,38])) # Cadres hors FP - ), - 'categorie_salarie' - ] = 1 - # Correction fonction publique - individus.loc[ - ( - (individus.categorie_salarie.isin([0, 1, 7])) - & (individus.cstot == 53) # Policiers et militaires reversé dans titulaire état - ), - 'categorie_salarie' - ] = 2 def create_contrat_de_travail(individus, period, salaire_type = 'imposable'): @@ -971,74 +902,14 @@ def create_effectif_entreprise(individus, period = None, survey_year = None): individus.effectif_entreprise.value_counts(dropna = False))) -def create_revenus(individus, revenu_type = 'imposable'): - """Création des plusieurs variablesde revenu. 
- - Ces variables sont: - chomage_net, - pensions_alimentaires_percues, - rag_net, - retraite_nette, - ric_net, - rnc_net, - et éventuellement, si revenus_type = 'imposable' des variables: - chomage_imposable, - rag, - retraite_imposable, - ric, - rnc, - salaire_imposable, +def create_revenus_remplacement_bruts(individus): + """ + Reconstitution des variables de retraite et chomage brut à partir des variables nettes et des variables de csg et crds """ - individus['chomage_brut'] = individus.csgchod_i + individus.chomage_net - individus['retraite_brute'] = individus.csgrstd_i + individus.retraite_nette - - if revenu_type == 'imposable': - variables = [ - # 'pension_alimentaires_percues', - 'chomage_imposable', - 'retraite_imposable', - ] - for variable in variables: - assert variable in individus.columns.tolist(), "La variable {} n'est pas présente".format(variable) - - for variable in variables: - if (individus[variable] < 0).any(): - - negatives_values = individus[variable].value_counts().loc[individus[variable].value_counts().index < 0] - log.debug("La variable {} contient {} valeurs négatives\n {}".format( - variable, - negatives_values.sum(), - negatives_values, - ) - ) - # csg des revenus de replacement - # 0 - Non renseigné/non pertinent - # 1 - Exonéré - # 2 - Taux réduit - # 3 - Taux plein - taux = pd.concat( - [ - individus.csgrstd_i / individus.retraite_brute, - individus.csgchod_i / individus.chomage_brut, - ], - axis = 1 - ).max(axis = 1) - - # taux.loc[(0 < taux) & (taux < .1)].hist(bins = 100) - individus['taux_csg_remplacement'] = np.select( - [ - taux.isnull(), - taux.notnull() & (taux < 0.021), - taux.notnull() & (taux > 0.021) & (taux < 0.0407), - taux.notnull() & (taux > 0.0407) - ], - [0, 1, 2, 3] - ) - for value in [0, 1, 2, 3]: - assert (individus.taux_csg_remplacement == value).any(), \ - "taux_csg_remplacement ne prend jamais la valeur {}".format(value) - assert individus.taux_csg_remplacement.isin(range(4)).all() + # revenu_brut = 
revenu_net + csg_deductible + csg_non_deductible_crds + individus['chomage_brut'] = individus.chomage_imposable + individus.csgchod_i #+ individus. csg_nd_crds_cho_i + individus['retraite_brute'] = individus.retraite_imposable + individus.csgrstd_i #+ individus.csg_nd_crds_ret_i def create_statut_matrimonial(individus): diff --git a/openfisca_france_data/erfs_fpr/input_data_builder/step_05_foyer.py b/openfisca_france_data/erfs_fpr/input_data_builder/step_05_foyer.py new file mode 100644 index 00000000..21683d06 --- /dev/null +++ b/openfisca_france_data/erfs_fpr/input_data_builder/step_05_foyer.py @@ -0,0 +1,57 @@ +import logging +import pandas as pd + + +from openfisca_survey_manager.temporary import temporary_store_decorator # type: ignore + + +log = logging.getLogger(__name__) + + +@temporary_store_decorator(file_name = 'erfs_fpr') +def build_variables_foyers_fiscal(temporary_store = None, year = None): + + assert temporary_store is not None + assert year is not None + + individus = temporary_store['individus_{}'.format(year)] + menages = temporary_store['menages_{}'.format(year)] + + individus['idfoy'] = individus['idfam'].copy() + individus['quifoy'] = individus['quifam'].copy() + + foyers_fiscaux = individus[['idfoy','ident',]].drop_duplicates() + foyers_fiscaux = pd.merge( + menages[[ + 'ident', + 'rev_financier_prelev_lib_imputes', + 'rev_fonciers_bruts', + 'rev_valeurs_mobilieres_bruts', + 'wprm', + ]], + foyers_fiscaux, + how = 'inner', + on = 'ident' + ) + # première version pour splitter les revenus du capital du ménage dans les foyers fiscaux + # on attribue l'ensemble des revenus du capital du ménage au foyer avec la personne ayant les plus hauts revenus + # procédure à améliorer + idfoy = (individus + .sort_values( + [ + 'ident', + 'salaire_de_base', + 'traitement_indiciaire_brut', + 'retraite_brute' + ], + ascending = False + ) + .groupby('ident') + .first() + .idfoy + ) + foyers_fiscaux['revenu_categoriel_foncier'] = 
foyers_fiscaux['rev_fonciers_bruts'] * foyers_fiscaux.idfoy.isin(idfoy) + foyers_fiscaux['revenus_capitaux_prelevement_forfaitaire_unique_ir'] = foyers_fiscaux['rev_valeurs_mobilieres_bruts'] * foyers_fiscaux.idfoy.isin(idfoy) + foyers_fiscaux['rev_financier_prelev_lib_imputes'] = foyers_fiscaux['rev_financier_prelev_lib_imputes'] * foyers_fiscaux.idfoy.isin(idfoy) + + temporary_store[f"foyers_fiscaux_{year}"] = foyers_fiscaux diff --git a/openfisca_france_data/erfs_fpr/input_data_builder/step_05_final.py b/openfisca_france_data/erfs_fpr/input_data_builder/step_06_final.py similarity index 83% rename from openfisca_france_data/erfs_fpr/input_data_builder/step_05_final.py rename to openfisca_france_data/erfs_fpr/input_data_builder/step_06_final.py index 1d3e0abe..582e296f 100644 --- a/openfisca_france_data/erfs_fpr/input_data_builder/step_05_final.py +++ b/openfisca_france_data/erfs_fpr/input_data_builder/step_06_final.py @@ -16,9 +16,10 @@ def create_input_data_frame(temporary_store = None, year = None, export_flattene individus = temporary_store['individus_{}'.format(year)] menages = temporary_store['menages_{}'.format(year)] + foyers_fiscaux = temporary_store['foyers_fiscaux_{}'.format(year)] # ici : variables à garder - variables = [ + var_individus = [ 'activite', 'age', 'categorie_salarie', @@ -39,12 +40,27 @@ def create_input_data_frame(temporary_store = None, year = None, export_flattene 'retraite_brute', 'ric', 'rnc', - 'statut_marital', - # 'salaire_imposable', 'salaire_de_base', - 'taux_csg_remplacement', + 'statut_marital', + "primes_fonction_publique", + "traitement_indiciaire_brut", + ] + var_foyers_fiscaux = [ + 'idfoy', + 'rev_financier_prelev_lib_imputes', + 'revenu_categoriel_foncier', + 'revenus_capitaux_prelevement_forfaitaire_unique_ir', ] + var_menages = [ + 'idmen', + 'loyer', + 'statut_occupation_logement', + 'taxe_habitation', + 'wprm', + 'zone_apl', + ] + # TODO: fix this simplistic inference individus.rename(columns = { 'ric_net': 'ric', 
@@ -55,7 +71,7 @@ def create_input_data_frame(temporary_store = None, year = None, export_flattene ) individus = create_ids_and_roles(individus) - individus = individus[variables].copy() + individus = individus[var_individus].copy() gc.collect() # This looks like it could have a sizeable impact @@ -64,13 +80,13 @@ def create_input_data_frame(temporary_store = None, year = None, export_flattene menages[k] = 0 # Again artificially putting missing variables in their default state - menages["loyer"] = 0 - menages["zone_apl"] = 2 - menages["statut_occupation_logement"] = 0 + #menages["loyer"] = 0 + #menages["zone_apl"] = 2 + #menages["statut_occupation_logement"] = 0 menages = extract_menages_variables(menages) - individus = create_collectives_foyer_variables(individus, menages) + # individus = create_collectives_foyer_variables(individus, menages) idmens = individus.idmen.unique() menages = menages.loc[ @@ -91,11 +107,20 @@ def create_input_data_frame(temporary_store = None, year = None, export_flattene menages = menages.rename(columns = {'idmen':'idmen_original'}) unique_idmen = individus[['idmen','idmen_original']].drop_duplicates() assert len(unique_idmen) == len(menages), "Number of idmen should be the same individus and menages tables." - menages = menages.merge( - unique_idmen, - how = 'inner', - on = 'idmen_original' - ) + + menages = menages.merge(unique_idmen, + how = 'inner', + on = 'idmen_original') + + foyers_fiscaux = foyers_fiscaux.rename(columns = {'idfoy':'idfoy_original'}) + unique_idfoy = individus[['idfoy','idfoy_original']].drop_duplicates() + assert len(unique_idmen) == len(menages), "Number of idfoy should be the same individus and foyers tables." 
+ + foyers_fiscaux = foyers_fiscaux.merge(unique_idfoy, + how = 'inner', + on = 'idfoy_original') + + foyers_fiscaux = foyers_fiscaux[var_foyers_fiscaux] if export_flattened_df_filepath: supermerge = individus.merge( @@ -116,6 +141,18 @@ def create_input_data_frame(temporary_store = None, year = None, export_flattene collection = "openfisca_erfs_fpr", survey_name = survey_name, ) + + foyers_fiscaux = foyers_fiscaux.sort_values(by = ['idfoy']) + log.debug(f"Saving entity 'foyers fiscaux' in collection 'openfisca_erfs_fpr' and survey name '{survey_name}' with set_table_in_survey") + set_table_in_survey( + foyers_fiscaux, + entity = "foyer_fiscal", + period = year, + collection = "openfisca_erfs_fpr", + survey_name = survey_name, + ) + log.debug("End of create_input_data_frame") + menages = menages.sort_values(by = ['idmen']) log.debug(f"Saving entity 'menage' in collection 'openfisca_erfs_fpr' and survey name '{survey_name}' with set_table_in_survey") set_table_in_survey( @@ -127,6 +164,7 @@ def create_input_data_frame(temporary_store = None, year = None, export_flattene ) log.debug("End of create_input_data_frame") + def create_collectives_foyer_variables(individus, menages): menages_revenus_fonciers = menages[['idmen', 'rev_fonciers_bruts']].copy() idmens = menages_revenus_fonciers.query('(rev_fonciers_bruts > 0)')['idmen'].tolist() @@ -182,7 +220,7 @@ def create_collectives_foyer_variables(individus, menages): assert set(foyers_revenus_fonciers.columns) == set(['idfoy', 'rev_fonciers_bruts', 'quifoy']) individus = individus.merge(foyers_revenus_fonciers, how = 'outer', on = ['idfoy', 'quifoy']) assert set(idmens) == set(individus .query('(rev_fonciers_bruts > 0)')['idmen'].tolist()) - individus.rename(columns = {'rev_fonciers_bruts': 'f4ba'}, inplace = True) + individus.rename(columns = {'rev_fonciers_bruts': 'revenu_categoriel_foncier'}, inplace = True) return individus @@ -227,16 +265,13 @@ def extract_menages_variables_from_store(temporary_store = None, year = 
None): def extract_menages_variables(menages): - variables = ['ident', 'wprm', 'taxe_habitation', 'rev_fonciers_bruts'] + variables = ['ident', 'wprm', 'taxe_habitation'] external_variables = ['loyer', 'zone_apl', 'statut_occupation_logement'] for external_variable in external_variables: if external_variable in menages.columns: log.debug("Found {} in menages table: we keep it".format(external_variable)) variables.append(external_variable) - # TODO: 2007-2010 ont la variable rev_fonciers et non pas rev_fonciers_bruts. Est-ce la même? - menages = menages.rename(columns={'rev_fonciers': 'rev_fonciers_bruts'}) menages = menages[variables].copy() - menages.taxe_habitation = - menages.taxe_habitation # taxes should be negative menages.rename(columns = dict(ident = 'idmen'), inplace = True) return menages diff --git a/openfisca_france_data/erfs_fpr/scenario.py b/openfisca_france_data/erfs_fpr/scenario.py index a72a0d9d..7f81adea 100644 --- a/openfisca_france_data/erfs_fpr/scenario.py +++ b/openfisca_france_data/erfs_fpr/scenario.py @@ -25,10 +25,13 @@ class ErfsFprSurveyScenario(AbstractErfsSurveyScenario): "rag", "retraite_brute", "retraite_imposable", + # "rev_financier_prelev_lib_imputes", + "revenu_categoriel_foncier", + "revenus_capitaux_prelevement_forfaitaire_unique_ir", "ric", "rnc", - "statut_marital", "salaire_de_base", + "statut_marital", "statut_occupation_logement", "taxe_habitation", "traitement_indiciaire_brut", diff --git a/openfisca_france_data/erfs_fpr/test_case_creation.py b/openfisca_france_data/erfs_fpr/test_case_creation.py new file mode 100644 index 00000000..b2572da1 --- /dev/null +++ b/openfisca_france_data/erfs_fpr/test_case_creation.py @@ -0,0 +1,146 @@ +import click +import ipdb as pdb +import logging +import pandas as pd +import sys +import yaml + +from pandas.api.types import is_datetime64_any_dtype as is_datetime +from openfisca_core.model_api import ETERNITY + + +from openfisca_france_data import france_data_tax_benefit_system +from 
openfisca_france_data.erfs_fpr import original_id_by_entity +from openfisca_france_data.erfs_fpr.scenario import ErfsFprSurveyScenario +from openfisca_france_data.erfs_fpr.comparison import ErfsFprtoInputComparator +from openfisca_france_data.erfs_fpr.get_survey_scenario import variables_converted_to_annual + + +tax_benefit_system = france_data_tax_benefit_system +openfisca_variables_by_entity = dict( + ( + entity.key, + [variable_name for variable_name, variable in tax_benefit_system.variables.items() if variable.entity.key == entity.key], + ) + for entity in tax_benefit_system.entities + ) + +id_variable_by_entity_key = ErfsFprSurveyScenario.id_variable_by_entity_key +weight_variable_by_entity = ErfsFprSurveyScenario.weight_variable_by_entity + + +renaming_variables_to_annual = { + monthly_variable: f"{monthly_variable}_annuel" + for monthly_variable in variables_converted_to_annual + } + +def build_test(period, noindiv, target_variables = None): + if target_variables is None: + target_variables = ErfsFprtoInputComparator.default_target_variables + + comparator = ErfsFprtoInputComparator() + comparator.period = period + input_dataframe_by_entity, target_dataframe_by_entity = comparator.get_test_dataframes(rebuild = True, noindivs = [noindiv]) + + def convert_date_to_sting(dataframe): + date_columns = list(dataframe.select_dtypes(include=["datetime64"])) + dataframe[date_columns] = dataframe[date_columns].astype(str) + + def remove_non_openfisca_columns(dataframe): + openfisca_variables = set(sum([list(value) for value in openfisca_variables_by_entity.values()], [])).union(set(["noindiv", "idmen_original"])) + selected_variables = list(set(dataframe.columns).intersection(openfisca_variables)) + return dataframe[selected_variables] + + def build_test_dict(dataframe_by_entity, renaming_variables_to_annual = None): + input_by_entity = dict() + for entity, dataframe in dataframe_by_entity.items(): + convert_date_to_sting(dataframe) + identifier = "noindiv" if entity 
== "individu" else "idmen_original" + entity_plural = "individus" if entity == "individu" else "menages" + input_by_entity[entity_plural] = input = dict() + dataframe[identifier] = "id_" + dataframe[identifier].astype(str) + df = remove_non_openfisca_columns(dataframe).set_index(identifier) + for row, series in df.iterrows(): + series.drop( + ( + list(weight_variable_by_entity.values()) + + list(id_variable_by_entity_key.values()) + + list(original_id_by_entity.values()) + ), + inplace = True, + errors = "ignore", + ) + if renaming_variables_to_annual: + series.rename(renaming_variables_to_annual, inplace = True) + input[row] = series.dropna().to_dict() + + return input_by_entity + + input_by_entity = build_test_dict(input_dataframe_by_entity) + output_by_entity = build_test_dict(target_dataframe_by_entity, renaming_variables_to_annual) + + relative_error_margin = { + "default": 5e-3, + } + test = dict( + name = f"Observation {noindiv} on {period}", + reforms = "openfisca_france_data.erfs_fpr.get_survey_scenario.erfs_fpr_plugin", + max_spiral_loops = 4, + relative_error_margin = relative_error_margin, + period = period, + input = input_by_entity, + output = output_by_entity, + ) + return test + + + +def export_test_file(period, noindiv, test_case_name = None): + """ + Export a erfs-fpr input and output to an OpenFisca test case. + + Args: + period (int): simulation year + noindiv (int): individu id number + test_case_name (str, optional): _description_. Defaults to Name of the test case file. Defaults to 'test_case_erfs_fpr_NOINDIV'. 
+ """ + if test_case_name is None: + test_case_name = f"test_case_erfs_fpr_{noindiv}" + + test_case_file_path = f'{test_case_name}.yaml' + test = build_test(period, noindiv) + + with open(test_case_file_path, 'w') as file: + yaml.dump(test, file, sort_keys=False) + + text = get_erfs_fpr_data_as_comment(noindiv) + + with open(test_case_file_path, "a+") as file: + _ = file.read() # this auto closes the file after reading, which is a good practice + file.write(text) + + +def get_erfs_fpr_data_as_comment(noind): + return "# Blabal" + + +@click.command() +@click.option('-n', '--noindiv', type = int, help = "Individual id number", required = True) +@click.option('-v', '--verbose', is_flag = True, default = False, help = "Increase output verbosity", show_default = True) +@click.option('-d', '--debug', is_flag = True, default = False, help = "Use python debugger", show_default = True) +def create_test(noindiv = 0, verbose = False, debug = False): + """Create test case for a specific ERFS FPR individual.""" + assert noindiv != 0, "Provide valid individual" + logging.basicConfig(level = logging.DEBUG if verbose else logging.WARNING, stream = sys.stdout) + from openfisca_france_data.erfs_fpr import REFERENCE_YEAR + period = REFERENCE_YEAR + try: + export_test_file(period, noindiv) + except Exception as e: + if debug: + pdb.post_mortem(sys.exc_info()[2]) + raise e + + +if __name__ == "__main__": + sys.exit(create_test()) diff --git a/openfisca_france_data/felin/__init__.py b/openfisca_france_data/felin/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/openfisca_france_data/felin/input_data_builder/create_variables_individuelles.py b/openfisca_france_data/felin/input_data_builder/create_variables_individuelles.py index 16e80f13..09d217a4 100644 --- a/openfisca_france_data/felin/input_data_builder/create_variables_individuelles.py +++ b/openfisca_france_data/felin/input_data_builder/create_variables_individuelles.py @@ -19,12 +19,12 @@ def 
create_taux_csg_remplacement(individus, period, tax_benefit_system, sigma = def compute_taux_csg_remplacement(rfr, nbptr): parameters = tax_benefit_system.get_parameters_at_instant(period.start) - seuils = parameters.prelevements_sociaux.contributions_sociales.csg.remplacement.pensions_de_retraite_et_d_invalidite - seuil_exoneration = seuils.seuil_de_rfr_1 + (nbptr - 1) * seuils.demi_part_suppl - seuil_reduction = seuils.seuil_de_rfr_2 + (nbptr - 1) * seuils.demi_part_suppl + seuils = parameters.prelevements_sociaux.contributions_sociales.csg.remplacement.seuils + seuil_exoneration = seuils.seuil_rfr1.seuil_rfr1 + (nbptr - 1)*2 * seuils.seuil_rfr1.demi_part_suppl_rfr1 + seuil_reduction = seuils.seuil_rfr2.seuil_rfr2 + (nbptr - 1)*2 * seuils.seuil_rfr2.demi_part_suppl_rfr2 taux_csg_remplacement = 0.0 * rfr if period.start.year >= 2019: - seuil_taux_intermediaire = seuils.seuil_rfr3 + (nbptr - 1) * seuils.demi_part_suppl_rfr3 + seuil_taux_intermediaire = seuils.seuil_rfr3.seuil_rfr3 + (nbptr - 1)*2 * seuils.seuil_rfr3.demi_part_suppl_rfr3 taux_csg_remplacement = np.where( rfr <= seuil_exoneration, 1, diff --git a/openfisca_france_data/model/base.py b/openfisca_france_data/model/base.py index a954ca0e..cc508420 100644 --- a/openfisca_france_data/model/base.py +++ b/openfisca_france_data/model/base.py @@ -1,6 +1,3 @@ -from datetime import date - - from openfisca_france.model.base import * diff --git a/openfisca_france_data/smic.py b/openfisca_france_data/smic.py index 000532f2..26e864f8 100644 --- a/openfisca_france_data/smic.py +++ b/openfisca_france_data/smic.py @@ -130,7 +130,7 @@ # this collects the data from openfisca-france/openfisca_france/parameters/marche_travail/salaire_minimum/smic/smic_b_horaire.yaml ? # if year < 1970: log.warning('SMIC before 1970 (SMIG) depends on zone. Which one to use is unclear. 
TBD.') # else : openfisca_france\parameters\marche_travail\salaire_minimum\smic\smic_b_horaire.yaml - smic_horaire_brut[year] = openfisca_france_tax_benefit_system.get_parameters_at_instant(instant = periods.period(year).start).marche_travail.salaire_minimum.smic.smic_b_horaire + smic_horaire_brut[year] = openfisca_france_tax_benefit_system.parameters(periods.period(year).start).marche_travail.salaire_minimum.smic.smic_b_horaire except: continue @@ -142,7 +142,7 @@ def smic_annuel_imposable_from_net(year, smic_hor_brut): try: # TODO: the formula is not 100 % flexible, I have hard-coded the 4 PSS cut-off; this could be improved in the future # then again, it seems not to be used at all for OFF-ERFS, just the smic_horaire_brut - params = openfisca_france_tax_benefit_system.get_parameters_at_instant(instant = periods.period(year).start) + params = openfisca_france_tax_benefit_system.parameters(instant = periods.period(year).start) smic_net = smic_annuel_net_by_year[year] working_hours = params.marche_travail.salaire_minimum.smic.nb_heures_travail_mensuel smic_brut = smic_hor_brut * working_hours * 12 @@ -176,13 +176,13 @@ def smic_annuel_imposable_from_net(year, smic_hor_brut): smic_horaire_brut_by_year = dict([ - (year, openfisca_france_tax_benefit_system.get_parameters_at_instant(instant = periods.period(year).start).marche_travail.salaire_minimum.smic.smic_b_horaire) + (year, openfisca_france_tax_benefit_system.parameters(instant = periods.period(year).start).marche_travail.salaire_minimum.smic.smic_b_horaire) for year in range(start_year, end_year) ]) smic_annuel_brut_by_year = dict([ (year, - smic_horaire_brut_by_year[year] * openfisca_france_tax_benefit_system.get_parameters_at_instant(instant = periods.period(year).start).marche_travail.salaire_minimum.smic.nb_heures_travail_mensuel * 12) + smic_horaire_brut_by_year[year] * openfisca_france_tax_benefit_system.parameters(instant = 
periods.period(year).start).marche_travail.salaire_minimum.smic.nb_heures_travail_mensuel * 12) for year in range(start_year, end_year) ]) \ No newline at end of file diff --git a/openfisca_france_data/surveys.py b/openfisca_france_data/surveys.py index b2463c60..0ccc5c58 100644 --- a/openfisca_france_data/surveys.py +++ b/openfisca_france_data/surveys.py @@ -146,11 +146,13 @@ def custom_initialize(self, simulation): ] computed_variables_used_as_input = [ - # 'chomage_brut', "chomage_imposable", + "primes_fonction_publique", "retraite_brute", "retraite_imposable", "salaire_de_base", + "traitement_indiciaire_brut", + # 'chomage_brut', ] three_year_span_variables = input_variables + computed_variables_used_as_input @@ -194,13 +196,6 @@ def custom_input_data_frame(self, input_data_frame, **kwargs): if "loyer" in input_data_frame: input_data_frame["loyer"] = 12 * input_data_frame.loyer - if "categorie_salarie" in input_data_frame: - log.debug("Setting categorie_salarie to 0 and 1 only") - input_data_frame.loc[ - input_data_frame.categorie_salarie.isin(range(2, 7)), - "categorie_salarie", - ] = 1 - for variable in ["quifam", "quifoy", "quimen"]: if variable in input_data_frame: log.debug(input_data_frame[variable].value_counts(dropna = False)) diff --git a/setup.py b/setup.py index 162783e4..99ee1db7 100644 --- a/setup.py +++ b/setup.py @@ -35,8 +35,10 @@ "click >= 8.0.0, < 9.0.0", "matplotlib >= 3.1.1, < 4.0.0", "multipledispatch >= 0.6.0, < 1.0.0", - "openFisca-france >= 145.0.0, < 146.0.0", - "openFisca-survey-manager >= 0.47.2, < 1.0.0", + # "openFisca-france >= 145.0.0, < 146.0.0", + "OpenFisca-France @ git+https://github.com/openfisca/openfisca-france.git@version_leap", + # "openFisca-survey-manager >= 0.47.2, < 1.0.0", + "OpenFisca-Survey-Manager @ git+https://github.com/openfisca/openfisca-survey-manager.git@version_leap", "wquantiles >= 0.3.0, < 1.0.0", # To compute weighted quantiles ], extras_require = { @@ -45,10 +47,11 @@ "bumpver >= 2022.1120", "dtale", 
"flake8 >= 3.7.0, < 3.8.0", + "ipdb >=0.13, <1.0", "ipython >= 7.5.0, < 8.0.0", "mypy >= 0.670, < 1.0.0", "pypandoc", - 'pytest >= 5.0.0, < 7.0.0', + # "pytest", # "pytest-cov >= 2.6.0, < 3.0.0", "scipy >= 1.2.1, < 2.0.0", "toolz >= 0.9.0, < 1.0.0", diff --git a/tests/erfs_fpr/integration/test_aggregates.py b/tests/erfs_fpr/integration/test_aggregates.py index 479888f2..3473bc70 100644 --- a/tests/erfs_fpr/integration/test_aggregates.py +++ b/tests/erfs_fpr/integration/test_aggregates.py @@ -10,7 +10,9 @@ import os + from openfisca_france_data import france_data_tax_benefit_system +from openfisca_france_data.erfs_fpr import REFERENCE_YEAR from openfisca_france_data.erfs_fpr.get_survey_scenario import get_survey_scenario from openfisca_france_data.aggregates import FranceAggregates as Aggregates @@ -76,7 +78,7 @@ def test_erfs_fpr_aggregates_reform(): @click.command() -@click.option('-y', '--year', 'year', default = 2018, help = "ERFS-FPR year", show_default = True, +@click.option('-y', '--year', 'year', default = REFERENCE_YEAR, help = "ERFS-FPR year", show_default = True, type = int, required = True) @click.option('-c', '--configfile', default = None, help = 'raw_data.ini path to read years to process.', show_default = True) diff --git a/tests/fixtures/formulas/af.yaml b/tests/fixtures/formulas/af.yaml index b6e90924..6c75f436 100644 --- a/tests/fixtures/formulas/af.yaml +++ b/tests/fixtures/formulas/af.yaml @@ -28,7 +28,7 @@ - id: "enfant2" age_en_mois: 2015-01: 9 * 12 - output_variables: + output: autonomie_financiere: 2015-01: - false diff --git a/tests/inversion/remplacement_2021.yaml b/tests/inversion/remplacement_2021.yaml new file mode 100644 index 00000000..50792e95 --- /dev/null +++ b/tests/inversion/remplacement_2021.yaml @@ -0,0 +1,81 @@ +- name: "Chomage avec RFR sous seuil 1, personne seule" + revkire: 11400 + nbp: 100 + chomage_imposable: 19000 # revkire < 11408 & nbp =1 : taux is taux_exonere (0), base == gross + chomage_brut_test: 19000 +- name: 
"Chomage avec RFR sous seuil 2, personne seule, exonération" + revkire: 11410 + nbp: 100 + chomage_imposable: 19000 # 11408 < revkire <= 14914 & nbp =1 : taux is taux_reduit BUT chomage_imposable < seuil d'exo so net == gross + chomage_brut_test: 19000 +- name: "Chomage avec RFR sous seuil 2, personne seule, pas d'exonération" + revkire: 11410 + nbp: 100 + chomage_imposable: 20216 # 11408 < revkire <= 14914 & nbp =1 : taux is taux_reduit, chomage_imposable > seuil d'exo; applying abattement + chomage_brut_test: 21000 +- name: "Chomage avec RFR au-dessus seuil 2, personne seule, pas d'exonération" + revkire: 15000 + nbp: 100 + chomage_imposable: 20216 # 14914 < revkire & nbp =1 : taux is taux_plein, chomage_imposable > seuil d'exo; applying abattement + chomage_brut_test: 21000 +- name: "Chomage avec RFR sous seuil 1, couple, pas d'exonération" + revkire: 17498 # 17500 = 11408 + 3046*2 + nbp: 200 + chomage_imposable: 20000 # revkire < 17500 & nbp =2 : taux is taux_exonere (0), net == gross + chomage_brut_test: 20000 +- name: "Chomage avec RFR sous seuil 2, couple, pas d'exonération" + revkire: 17503 # 17500 = 11408 + 3046*2 + nbp: 200 + chomage_imposable: 20014 # revkire < 17500 & nbp =2 : taux is taux_reduit, chomage_imposable > seuil d'exo; applying abattement + chomage_brut_test: 20790 +- name: "Chomage avec RFR au dessus seuil 2, couple, pas d'exonération" + revkire: 22900 # 22878 = 14914 + 3982*2 + nbp: 200 + chomage_imposable: 20014 # revkire < 17500 & nbp =2 : taux is taux_reduit, chomage_imposable > seuil d'exo; applying abattement + chomage_brut_test: 20790 +- name: "Retraite avec RFR sous seuil 1, personne seule" + revkire: 11400 + nbp: 100 + retraite_imposable: 19000 + retraite_brute_test: 19000 +- name: "Retraite avec RFR sous seuil 1, personne seule" + revkire: 11400 + nbp: 100 + retraite_imposable: 20000 + retraite_brute_test: 20000 +- name: "Retraite avec RFR sous seuil 2, personne seule" + revkire: 11450 + nbp: 100 + retraite_imposable: 19000 + 
retraite_brute_test: 19750 # This checks that the chomage exoneration is indeed silent +- name: "Retraite avec RFR sous seuil 2, personne seule" + revkire: 11450 + nbp: 100 + retraite_imposable: 20000 + retraite_brute_test: 20790 # 20000/(1-0.038) +- name: "Retraite avec RFR sous seuil 3, personne seule" + revkire: 15000 + nbp: 100 + retraite_imposable: 20000 + retraite_brute_test: 20876 # 20000/(1-0.042) +- name: "Retraite avec RFR au dessus seuil 3, personne seule" + revkire: 24000 + nbp: 100 + retraite_imposable: 20000 + retraite_brute_test: 21254 # 20000/(1-0.059) +- name: "Retraite avec RFR sous seuil 2, couple" + revkire: 17503 # 17500 = 11408 + 3046*2 + nbp: 200 + retraite_imposable: 20000 + retraite_brute_test: 20790 # 20000/(1-0.038) +- name: "Retraite avec RFR sous seuil 3, couple" + revkire: 22900 # 22878 = 14914 + 3982*2 + nbp: 200 + retraite_imposable: 20000 + retraite_brute_test: 20876 # 20000/(1-0.042) +- name: "Retraite avec RFR au dessus seuil 3, couple" + revkire: 35510 # 35505 = 23147 + 2*6179 + nbp: 200 + retraite_imposable: 20000 + retraite_brute_test: 21254 # 20000/(1-0.059) + diff --git a/tests/inversion/salaire_2018.yaml b/tests/inversion/salaire_2018.yaml new file mode 100644 index 00000000..3ad3b57c --- /dev/null +++ b/tests/inversion/salaire_2018.yaml @@ -0,0 +1,69 @@ +- name: "Salaire salarié privé non-cadre temps plein, 1 PSS" + salaire_net: 31118.076 + heures_remunerees_volume: 1820.04 + contrat_de_travail: 0 + categorie_salarie: 0 + salaire_de_base_test: 39732 + +- name: "Salaire salarié privé non-cadre temps partiel, 1 PSS" + salaire_net: 31475.5572 + heures_remunerees_volume: 910.02 + contrat_de_travail: 1 + categorie_salarie: 0 + salaire_de_base_test: 39732 + +- name: "Salaire salarié privé cadre temps plein, 1 PSS" + salaire_net: 30725.7984 + heures_remunerees_volume: 1820.04 + contrat_de_travail: 0 + categorie_salarie: 1 + salaire_de_base_test: 39732 + +- name: "Salaire fonc pub Etat temps plein, 1 PSS" + salaire_net: 38994.42 + 
heures_remunerees_volume: 1820.04 + contrat_de_travail: 0 + categorie_salarie: 2 + traitement_brut_test: 39732 + +- name: "Salaire salarié privé non-cadre temps plein, 4 PSS" + salaire_net: 130193.424 + heures_remunerees_volume: 1820.04 + contrat_de_travail: 0 + categorie_salarie: 0 + salaire_de_base_test: 158928 + +- name: "Salaire salarié privé cadre temps plein, 4 PSS" + salaire_net: 126849.75 + heures_remunerees_volume: 1820.04 + contrat_de_travail: 0 + categorie_salarie: 1 + salaire_de_base_test: 158928 + +- name: "Salaire salarié pub Etat temps plein, 4 PSS" + salaire_net: 155974.728 + heures_remunerees_volume: 1820.04 + contrat_de_travail: 0 + categorie_salarie: 2 + traitement_brut_test: 158928 + +- name: "Salaire salarié privé non-cadre temps plein, 8 PSS" + salaire_net: 273325.644 + heures_remunerees_volume: 1820.04 + contrat_de_travail: 0 + categorie_salarie: 0 + salaire_de_base_test: 317856 + +- name: "Salaire salarié privé cadre temps plein, 8 PSS" + salaire_net: 255948.684 + heures_remunerees_volume: 1820.04 + contrat_de_travail: 0 + categorie_salarie: 1 + salaire_de_base_test: 317856 + +- name: "Salaire salarié pub Etat temps plein, 8 PSS" + salaire_net: 311935.596 + heures_remunerees_volume: 1820.04 + contrat_de_travail: 0 + categorie_salarie: 2 + traitement_brut_test: 317856 diff --git a/tests/inversion/salaire_2021.yaml b/tests/inversion/salaire_2021.yaml new file mode 100644 index 00000000..dabf7682 --- /dev/null +++ b/tests/inversion/salaire_2021.yaml @@ -0,0 +1,34 @@ +- name: "Salaire salarié privé non-cadre temps plein, 1 PSS" + salaire_net: 32505 + heures_remunerees_volume: 1820.04 + contrat_de_travail: 0 + categorie_salarie: 0 + salaire_de_base_test: 41136 + # assiette_cotisation 41136 + # assiette_csg 41136 + # csg_deductible 6.8 + # csg_imposable 2.4 + # crds 0.5 + # cotisations_sociales + ## agff 0 + ## agirc_arrco 3.15 + ## asf 0 + ## arrco 0 + ## ceg 0.86 + ## cet2019 0.14 + ## chomage 0 + ## vieillesse_deplafonnee 0.4 + ## 
# Tail of the preceding patch hunk (tests/inversion/salaire_2021.yaml) and the
# header of the hunk creating this module, preserved as found:
#
#   vieillesse_plafonnee 6.9
#   +
#   + # soit (6.8 + 2.4 + 0.5) * 0.9825 + 3.15 + 0.86 + 0.4 + 6.9
#   + # 20.84025
#   +
#   +- name: "Salaire salarié privé non-cadre temps plein, juste au dessus d'un PSS"
#   + salaire_net: 32515.1772
#   + heures_remunerees_volume: 1820.04
#   + contrat_de_travail: 0
#   + categorie_salarie: 0
#   + salaire_de_base_test: 41148
#   +
#   + # soit (6.8 + 2.4 + 0.5) * 0.9825 + 3.15 + 0.86 + 0.14 + 0.4 + 6.9
#   + # 20.98025
#   diff --git a/tests/test_inversion.py b/tests/test_inversion.py
#   new file mode 100644
#   index 00000000..8cc310d9
#   --- /dev/null
#   +++ b/tests/test_inversion.py
#   @@ -0,0 +1,122 @@
"""Inversion tests: recompute gross incomes from net ones and check them.

Each section below loads a YAML fixture from the ``inversion`` directory that
sits next to this file, runs the inversion helpers on the resulting DataFrame
and asserts that every recomputed amount is within ``margin`` of the matching
``*_test`` reference column.

NOTE(review): the checks run at import time, as in the original script.
Wrapping each section in a ``test_*`` function would let pytest report them
individually -- left unchanged here to keep the module's behaviour.
"""
import os
import re

import pandas as pd
from yaml import load, SafeLoader

from openfisca_core.periods import period as p
from openfisca_core.taxscales import MarginalRateTaxScale, combine_tax_scales
from openfisca_core.formula_helpers import switch

from openfisca_france import FranceTaxBenefitSystem

from openfisca_france_data.felin.input_data_builder.create_variables_individuelles import create_taux_csg_remplacement
from openfisca_france_data.common import create_revenus_remplacement_bruts, create_salaire_de_base, create_traitement_indiciaire_brut


# Largest tolerated absolute gap between a recomputed amount and its reference.
margin = 1

tax_benefit_system = FranceTaxBenefitSystem()
scenario = tax_benefit_system.new_scenario()

# Fixtures are resolved relative to this file so the tests run on any checkout
# (the previous value was an absolute path on one developer's machine).
cd = os.path.dirname(os.path.abspath(__file__))


def _fixture_path(file_name):
    """Return the path of ``file_name`` inside the ``inversion`` fixture folder."""
    return os.path.join(cd, "inversion", file_name)


def _fixture_year(path):
    """Extract the 4-digit year that precedes ``.yaml`` in ``path``.

    Raises:
        ValueError: if ``path`` does not end with ``<4 digits>.yaml``.
    """
    found = re.match(r".*([0-9]{4})\.yaml$", path)
    if found is None:
        raise ValueError("No 4-digit year found in fixture name: {}".format(path))
    return found.group(1)


def _load_individus(path):
    """Load the YAML fixture at ``path`` into a DataFrame (one row per test case)."""
    # Explicit encoding: the fixture names contain accented characters.
    with open(path, encoding="utf-8") as fixture_file:
        return pd.DataFrame.from_dict(load(fixture_file, Loader=SafeLoader))


def _failing_rows(individus, computed, expected):
    """Return the index labels where ``computed`` is ``margin`` or more away from ``expected``.

    Rows where either value is missing (NaN) never count as failures, because
    a comparison against NaN is False -- same outcome as the row-by-row check
    this helper replaces.
    """
    gap = (individus[computed] - individus[expected]).abs()
    return gap[gap >= margin].index.tolist()


def _failure_report(individus, rows, computed, expected):
    """Build one message line per label in ``rows`` for the given column pair."""
    return "".join(
        "For test {}, found {} for {}, tested against {}.\n".format(
            i, individus.loc[i][computed], computed, individus.loc[i][expected]
        )
        for i in rows
    )


# Replacement incomes

## First part: upwards (start from *_taxable, invert to *_gross)

### Data creation

path = _fixture_path("remplacement_2021.yaml")
year = _fixture_year(path)
individus = _load_individus(path)

### Invert incomes from net to gross: the functions under test.
### Their return values are ignored; the results are read back from `individus`.

create_taux_csg_remplacement(individus, p(year), tax_benefit_system)
create_revenus_remplacement_bruts(individus, p(year), tax_benefit_system)

### Test against chomage_brut_test / retraite_brute_test

fails_chomage = _failing_rows(individus, "chomage_brut", "chomage_brut_test")
fails_retraite = _failing_rows(individus, "retraite_brute", "retraite_brute_test")

message = (
    _failure_report(individus, fails_chomage, "chomage_brut", "chomage_brut_test")
    + _failure_report(individus, fails_retraite, "retraite_brute", "retraite_brute_test")
)

assert len(fails_chomage) + len(fails_retraite) == 0, "Some tests have failed.\n" + message

# ## Second part: downwards (start from the gross obtained by inversion, go back to taxable)

# ### Initialize the survey scenario with the gross (inverted)

# init_single_entity(scenario, init_data)
# simulation = scenario.new_simulation()

# ### Computes *_taxable back from inverted *_gross

# simulation.calculate('chomage_imposable', '2021-01') == 1000
# simulation.calculate('csg_deductible_chomage', '2021-01') == 0
# simulation.calculate('csg_imposable_chomage', '2021-01') == 0
# simulation.calculate('crds_chomage', '2021-01') == 1

# Wages

##### ##### ##### ##### ##### ##### #####
##### Pre 2019 reform: inversion TAXIPP

path = _fixture_path("salaire_2018.yaml")
year = _fixture_year(path)
individus = _load_individus(path)

### Invert incomes from net to gross: the functions under test

create_salaire_de_base(individus, p(year), revenu_type="net", tax_benefit_system=tax_benefit_system)
create_traitement_indiciaire_brut(individus, p(year), revenu_type="net", tax_benefit_system=tax_benefit_system)

### Test against salaire_de_base_test and traitement_brut_test

fails_salaire = _failing_rows(individus, "salaire_de_base", "salaire_de_base_test")
fails_traitement = _failing_rows(individus, "traitement_indiciaire_brut", "traitement_brut_test")
# A row fails when either of its two amounts is off.
fails = [i for i in individus.index if i in fails_salaire or i in fails_traitement]

# Report each failing column under its own name; previously every failure was
# printed as a salaire_de_base mismatch, even when the faulty amount was
# traitement_indiciaire_brut.
message = (
    _failure_report(individus, fails_salaire, "salaire_de_base", "salaire_de_base_test")
    + _failure_report(individus, fails_traitement, "traitement_indiciaire_brut", "traitement_brut_test")
)

assert len(fails) == 0, "Some tests have failed.\n" + message

##### ##### ##### ##### ##### ##### #####
##### Post 2019 reform: let's anticipate (and help LexImpact?)

path = _fixture_path("salaire_2021.yaml")
year = _fixture_year(path)
individus = _load_individus(path)

### Invert incomes from net to gross: the function under test

create_salaire_de_base(individus, p(year), revenu_type="net", tax_benefit_system=tax_benefit_system)

### Test against salaire_de_base_test

fails = _failing_rows(individus, "salaire_de_base", "salaire_de_base_test")

message = _failure_report(individus, fails, "salaire_de_base", "salaire_de_base_test")

assert len(fails) == 0, "Some tests have failed.\n" + message

### Known problem with the 2019 CET: it is not scale-like (due on the whole
### tax base, but only if the tax base exceeds 1 PSS).