From d40bcf67e8729449aca293d3a6a33a50b6720d26 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20Gigandet?= Date: Thu, 5 Sep 2024 10:43:57 +0200 Subject: [PATCH] feat: script to extract historical data for a field (e.g. Nutri-Score) for a set of products (#10408) Needed to get Nutri-Score labels on packaging, at different years, for a study. Sample usage for the Nutricourses study: ``` off@off:/home/stephane/nutricourses$ (off) more nutricourses.sh #!/bin/sh export PERL5LIB=/srv/off/lib /srv/off/scripts/extract_historical_product_data.pl --min-year 2017 --max-year 2024 --recompute-taxonomies --field labels_tags=en:nutriscore-grade- --omit-prefix --codes-file nutricourses.csv ``` Sample output: ![image](https://github.com/openfoodfacts/openfoodfacts-server/assets/8158668/a7517008-9b6f-4e87-a71d-0118c4b0c640) --- lib/ProductOpener/Display.pm | 159 +++++++---- scripts/extract_historical_product_data.pl | 318 +++++++++++++++++++++ scripts/update_all_products.pl | 46 +-- stop_words.txt | 3 +- 4 files changed, 425 insertions(+), 101 deletions(-) create mode 100755 scripts/extract_historical_product_data.pl diff --git a/lib/ProductOpener/Display.pm b/lib/ProductOpener/Display.pm index 550590b74eb56..f763bd4aab648 100644 --- a/lib/ProductOpener/Display.pm +++ b/lib/ProductOpener/Display.pm @@ -102,6 +102,7 @@ BEGIN { &count_products &add_params_to_query + &add_params_and_filters_to_query &url_for_text &process_template @@ -1661,9 +1662,7 @@ sub generate_query_cache_key ($name, $context_ref, $request_ref) { sub query_list_of_tags ($request_ref, $query_ref) { - add_params_to_query($request_ref, $query_ref); - - add_country_and_owner_filters_to_query($request_ref, $query_ref); + add_params_and_filters_to_query($request_ref, $query_ref); my $groupby_tagtype = $request_ref->{groupby_tagtype}; @@ -4457,23 +4456,23 @@ HTML return; } -=head2 list_all_request_params ( $request_ref, $query_ref ) +=head2 get_all_request_params ( $request_ref ) -Return an array of names of all 
request parameters. +Return a reference to a hash of all request parameters (CGI params and JSON body) name and values. =cut -sub list_all_request_params ($request_ref) { +sub get_all_request_params ($request_ref) { # CGI params (query string and POST body) - my @params = multi_param(); + my %params = map {$_ => decode("utf8", single_param($_))} multi_param(); - # Add params from the JSON body if any + # Add params from the JSON body if any (JSON body overwrites CGI params if there is a conflict) if (defined $request_ref->{body_json}) { - push @params, keys %{$request_ref->{body_json}}; + @params{keys %{$request_ref->{body_json}}} = values %{$request_ref->{body_json}}; } - return @params; + return \%params; } =head2 display_search_results ( $request_ref ) @@ -4503,17 +4502,22 @@ sub display_search_results ($request_ref) { my $current_link = ''; - foreach my $field (list_all_request_params($request_ref)) { + my $params_ref = get_all_request_params($request_ref); + + $log->debug("display_search_results - params", {params => $params_ref}) + if $log->is_debug(); + + foreach my $field (sort keys %$params_ref) { if ( - ($field eq "page") - or ($field eq "fields") - or ($field eq "keywords") # returned by CGI.pm when there are not params: keywords=search + not( + ($field eq "page") + or ($field eq "fields") + or ($field eq "keywords") # returned by CGI.pm when there are not params: keywords=search + ) ) { - next; + $current_link .= "\&$field=" . URI::Escape::XS::encodeURIComponent($params_ref->{$field}); } - - $current_link .= "\&$field=" . 
URI::Escape::XS::encodeURIComponent(decode utf8 => single_param($field)); } $current_link =~ s/^\&/\?/; @@ -4706,20 +4710,22 @@ sub get_products_collection_request_parameters ($request_ref, $additional_parame return $parameters_ref; } -=head2 add_params_to_query ( $request_ref, $query_ref ) +=head2 add_params_and_filters_to_query ( $request_ref, $query_ref ) This function is used to parse search query parameters that are passed to the API (/api/v?/search endpoint) or to the web site search (/search endpoint) -either as query string parameters (e.g. ?labels_tags=en:organic) or -POST parameters. +either as query string parameters (e.g. ?labels_tags=en:organic), +POST parameters, or POST JSON body parameters. -The function adds the corresponding query filters in the MongoDB query. +The function then adds the corresponding query filters in the MongoDB query. + +It also adds the country and owner filters to the query. =head3 Parameters -=head4 $request_ref (output) +=head4 $request_ref (input) -Reference to the internal request object. +Reference to the request object. 
=head4 $query_ref (output) @@ -4750,6 +4756,57 @@ my %ignore_params = ( no_count => 1, ); +sub add_params_and_filters_to_query($request_ref, $query_ref) { + + my $params_ref = get_all_request_params($request_ref); + + # Filter out parameters that are not query filters + foreach my $field (keys %$params_ref) { + + if (defined $ignore_params{$field}) { + delete $params_ref->{$field}; + } + # Some parameters like page / page_size and sort_by are related to the query + # but not query filters, we set them at the request object level + elsif (($field eq "page") or ($field eq "page_size")) { + $request_ref->{$field} = $params_ref->{$field} + 0; # Make sure we have a number + delete $params_ref->{$field}; + } + + elsif ($field eq "sort_by") { + $request_ref->{$field} = $params_ref->{$field}; + delete $params_ref->{$field}; + } + } + + add_params_to_query($params_ref, $query_ref); + + add_country_and_owner_filters_to_query($request_ref, $query_ref); + + return; +} + +=head2 add_params_to_query ( $params_ref, $query_ref ) + +This function is used to parse search query parameters that are passed +to the API (/api/v?/search endpoint) or to the web site search (/search endpoint) +either as query string parameters (e.g. ?labels_tags=en:organic), +POST parameters, or POST JSON body parameters. + +The function then adds the corresponding query filters in the MongoDB query. + +=head3 Parameters + +=head4 $params_ref (input) + +Reference to a hash of parameters (name and value). + +=head4 $query_ref (output) + +Reference to the MongoDB query object. + +=cut + # Parameters that can be query filters passed as parameters # (GET query parameters, POST JSON body or from url facets), # in addition to tags fields. 
@@ -4757,36 +4814,26 @@ my %ignore_params = ( my %valid_params = (code => 1, creator => 1); -sub add_params_to_query ($request_ref, $query_ref) { +sub add_params_to_query ($params_ref, $query_ref) { - $log->debug("add_params_to_query", {params => {CGI::Vars()}}) if $log->is_debug(); + $log->debug("add_params_to_query", {params => $params_ref}) if $log->is_debug(); # nocache was renamed to no_cache - if (defined single_param('nocache')) { - param('no_cache', single_param('nocache')); + if (defined $params_ref->{nocache}) { + $params_ref->{no_cache} = $params_ref->{nocache}; + delete $params_ref->{nocache}; } my $and = $query_ref->{"\$and"}; - foreach my $field (list_all_request_params($request_ref)) { + foreach my $field (sort keys %$params_ref) { $log->debug("add_params_to_query - field", {field => $field}) if $log->is_debug(); - # skip params that are not query filters - next if (defined $ignore_params{$field}); - - if (($field eq "page") or ($field eq "page_size")) { - $request_ref->{$field} = single_param($field) + 0; # Make sure we have a number - } - - elsif ($field eq "sort_by") { - $request_ref->{$field} = single_param($field); - } - # Tags fields can be passed with taxonomy ids as values (e.g labels_tags=en:organic) # or with values in a given language (e.g. labels_tags_fr=bio) - elsif ($field =~ /^(.*)_tags(_(\w\w))?/) { + if ($field =~ /^(.*)_tags(_(\w\w))?/) { my $tagtype = $1; my $tag_lc = $lc; if (defined $3) { @@ -4800,7 +4847,7 @@ sub add_params_to_query ($request_ref, $query_ref) { # xyz_tags=-c products without the c tag # xyz_tags=a,b,-c,-d - my $values = remove_tags_and_quote(request_param($request_ref, $field)); + my $values = remove_tags_and_quote($params_ref->{$field}); $log->debug("add_params_to_query - tags param", {field => $field, lc => $lc, tag_lc => $tag_lc, values => $values}) @@ -4938,7 +4985,7 @@ sub add_params_to_query ($request_ref, $query_ref) { # We can have multiple conditions, separated with a comma # e.g. 
sugars_100g=>10,<=20 - my $conditions = request_param($request_ref, $field); + my $conditions = $params_ref->{$field}; $log->debug("add_params_to_query - nutrient conditions", {field => $field, conditions => $conditions}) if $log->is_debug(); @@ -4956,7 +5003,7 @@ sub add_params_to_query ($request_ref, $query_ref) { } else { $operator = '='; - $value = request_param($request_ref, $field); + $value = $params_ref->{$field}; } $log->debug("add_params_to_query - nutrient condition", @@ -4986,7 +5033,7 @@ sub add_params_to_query ($request_ref, $query_ref) { # Exact match on a specific field (e.g. "code") elsif (defined $valid_params{$field}) { - my $values = remove_tags_and_quote(request_param($request_ref, $field)); + my $values = remove_tags_and_quote($params_ref->{$field}); # Possible values: # xyz=a @@ -5055,13 +5102,11 @@ sub search_and_display_products ($request_ref, $query_ref, $sort_by, $limit, $pa my $cache_results_flag = scalar(not $request_ref->{is_crawl_bot}); my $template_data_ref = {}; - add_params_to_query($request_ref, $query_ref); - $log->debug("search_and_display_products", {request_ref => $request_ref, query_ref => $query_ref, sort_by => $sort_by}) if $log->is_debug(); - add_country_and_owner_filters_to_query($request_ref, $query_ref); + add_params_and_filters_to_query($request_ref, $query_ref); if (defined $limit) { } @@ -5911,9 +5956,7 @@ sub search_and_export_products ($request_ref, $query_ref, $sort_by) { $format = $request_ref->{format}; } - add_params_to_query($request_ref, $query_ref); - - add_country_and_owner_filters_to_query($request_ref, $query_ref); + add_params_and_filters_to_query($request_ref, $query_ref); $log->debug("search_and_export_products - MongoDB query", {format => $format, query => $query_ref}) if $log->is_debug(); @@ -6816,9 +6859,7 @@ HTML sub search_and_graph_products ($request_ref, $query_ref, $graph_ref) { - add_params_to_query($request_ref, $query_ref); - - add_country_and_owner_filters_to_query($request_ref, 
$query_ref); + add_params_and_filters_to_query($request_ref, $query_ref); my $cursor; @@ -7168,9 +7209,7 @@ Base query that will be modified to be able to build the map sub search_products_for_map ($request_ref, $query_ref) { - add_params_to_query($request_ref, $query_ref); - - add_country_and_owner_filters_to_query($request_ref, $query_ref); + add_params_and_filters_to_query($request_ref, $query_ref); my $cursor; @@ -10865,9 +10904,7 @@ XML sub display_recent_changes ($request_ref, $query_ref, $limit, $page) { - add_params_to_query($request_ref, $query_ref); - - add_country_and_owner_filters_to_query($request_ref, $query_ref); + add_params_and_filters_to_query($request_ref, $query_ref); if (defined $limit) { } @@ -11458,9 +11495,7 @@ Analyze the distribution of selected parent ingredients in the searched products sub search_and_analyze_recipes ($request_ref, $query_ref) { - add_params_to_query($request_ref, $query_ref); - - add_country_and_owner_filters_to_query($request_ref, $query_ref); + add_params_and_filters_to_query($request_ref, $query_ref); my $cursor; diff --git a/scripts/extract_historical_product_data.pl b/scripts/extract_historical_product_data.pl new file mode 100755 index 0000000000000..09f4cbb7a3da8 --- /dev/null +++ b/scripts/extract_historical_product_data.pl @@ -0,0 +1,318 @@ +#!/usr/bin/perl -w + +# This file is part of Product Opener. +# +# Product Opener +# Copyright (C) 2011-2024 Association Open Food Facts +# Contact: contact@openfoodfacts.org +# Address: 21 rue des Iles, 94100 Saint-Maur des Fossés, France +# +# Product Opener is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. 
+# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . + +use ProductOpener::PerlStandards; + +my $usage = < $query_params_ref, # filters for mongodb query + "codes-file=s" => \$codes_file, # file with product codes to extract data for + "field=s" => \$field_to_extract, + "omit-prefix" => \$omit_prefix, + "recompute-taxonomies" => + \$recompute_taxonomies, # Recompute tag fields like categories and labels with the current taxonomies + "analyze-and-enrich-product-data" => \$analyze_and_enrich_product_data, + "min-year=i" => \$min_year, + "max-year=i" => \$max_year, +) or die("Error in command line arguments:\n\n$usage"); + +my @codes = (); + +# Get the list of product codes from a file +if (defined $codes_file) { + open(my $fh, '<', $codes_file) or die("Could not open file '$codes_file' $!"); + while (my $row = <$fh>) { + chomp $row; + $row =~ s/\t.*//; # Assume codes are in the first column; remove the rest + push @codes, $row; + } + close $fh; + my $products_count = scalar @codes; + print STDERR "$products_count product codes read from $codes_file\n"; # codes come from the file here, not from MongoDB + +} +else { + # Use query filters entered using --query categories_tags=en:plant-milks + + # Build the mongodb query from the --query parameters + my $query_ref = {}; + + add_params_to_query($query_params_ref, $query_ref); + + use Data::Dumper; + print STDERR "MongoDB query:\n" . Dumper($query_ref); + + my $socket_timeout_ms = 2 * 60000; # 2 mins, instead of 30s default, to not die as easily if mongodb is busy. 
+ + # Collection that will be used to iterate products + # TODO: we should query both the products and obsolete products collections + my $products_collection = get_products_collection({obsolete => 0, timeout => $socket_timeout_ms}); + + my $products_count = ""; + + eval { + $products_count = $products_collection->count_documents($query_ref); + + print STDERR "$products_count documents retrieved from MongoDB\n"; + }; + + my $cursor = $products_collection->query($query_ref)->fields({_id => 1, code => 1, owner => 1}); + $cursor->immortal(1); + + while (my $product_ref = $cursor->next) { + push @codes, $product_ref->{code}; + } +} + +# load data needed to analyze and enrich products +if ($analyze_and_enrich_product_data) { + load_data(); +} + +# Print the header +my $years = ""; +for (my $year = $min_year; $year <= $max_year; $year++) { + $years .= "\t" . $year; +} + +print "code" . "\t" . "found" . "\t" . "url" . $years . "\n"; + +# Go through all products +my $products = 0; +my $found_products = 0; + +sub save_product_field_value_for_year($product_ref, $field_to_extract, $year, $value_per_year_ref) { + + my $value; + + # Make sure we have a lc field + if ((not defined $product_ref->{lc}) and (defined $product_ref->{lang})) { + $product_ref->{lc} = $product_ref->{lang}; + } + + # This option runs all the data enrichment functions + if ($analyze_and_enrich_product_data) { + analyze_and_enrich_product_data($product_ref, {}); + } + + # Value of a tag field that has a specific prefix + # e.g. 
labels_tags=en:nutriscore- + # We will take the value of the first tag that has the prefix + + if ($field_to_extract =~ /^(.+)_tags=(.+)$/) { + my $tagtype = $1; + my $prefix = $2; + # *_tags fields may contain old canonical tags, we can recompute tag fields with the newest taxonomy + if (($recompute_taxonomies) and (defined $taxonomy_fields{$tagtype})) { + # if the field was previously not taxonomized, the $field_hierarchy field does not exist + # assume the $field value is in the main language of the product + if ( + + defined $product_ref->{$tagtype . "_hierarchy"} + ) + { + # we do not know the language of the current value of $product_ref->{$tagtype} + # so regenerate it in English + + $product_ref->{$tagtype} + = list_taxonomy_tags_in_language("en", $tagtype, $product_ref->{$tagtype . "_hierarchy"}); + } + + compute_field_tags($product_ref, "en", $tagtype); + } + foreach my $tag (@{$product_ref->{$tagtype . "_tags"}}) { + if ($tag =~ /^\Q$prefix\E/) { # \Q..\E: match the user-supplied prefix literally, not as a regex + if ($omit_prefix) { + $value = $'; + } + else { + $value = $tag; + } + last; + } + } + } + else { + # Access any field in the product + # e.g. 
nutriments.energy_100g + $value = deep_get($product_ref, split(/\./, $field_to_extract)); + } + $value_per_year_ref->{$year} = $value; + + return; +} + +foreach my $code (@codes) { + + $products++; + + my $productid = $code; + my $path = product_path_from_id($productid); + my $found = 0; + + # We will go through revision one by one, and for the requested field, we will + # store the value we have at the beginning of each year + my %value_per_year = (); + my $url = ""; + + # Go through all product revisions + my $changes_ref = retrieve("$BASE_DIRS{PRODUCTS}/$path/changes.sto"); + + # If we don't have a changes.sto file, the product is not in the database + if (defined $changes_ref) { + $found = 1; + $found_products++; + + # Go through all revisions, keep the latest value of all fields + + my %deleted_values = (); + my $previous_product_ref; + my $revs = 0; + my $current_year; + + foreach my $change_ref (@{$changes_ref}) { + $revs++; + my $rev = $change_ref->{rev}; + if (not defined $rev) { + $rev = $revs; # was not set before June 2012 + } + + my $product_ref = retrieve("$BASE_DIRS{PRODUCTS}/$path/$rev.sto"); + + if (defined $product_ref) { + + # Determine the year of the revision, using the UNIX timestamp last_modified_t + my $year = (localtime($product_ref->{last_modified_t}))[5] + 1900; + + # We consider the value at the beginning of the year + # So the last value of the previous year is the value for the current year + # Save the value if the year has changed + if ((not defined $current_year) or ($year != $current_year)) { + if ($current_year) { + save_product_field_value_for_year($previous_product_ref, $field_to_extract, $current_year + 1, + \%value_per_year); + } + $current_year = $year; + } + + # Keep a reference to the previous revision + $previous_product_ref = $product_ref; + } + } + + # Save the value for the last year + if ($current_year) { + save_product_field_value_for_year($previous_product_ref, $field_to_extract, $current_year + 1, + 
\%value_per_year); + } + + if ($previous_product_ref) { + $url = "https://world.$server_domain" . product_url($previous_product_ref); + } + } + + # Assign values for missing years and output values between min_year and max_year + my $values = ""; + for (my $year = 2000; $year <= $max_year; $year++) { + if (not defined $value_per_year{$year}) { + $value_per_year{$year} = $value_per_year{$year - 1}; + } + if ($year >= $min_year) { + $values .= "\t" . ($value_per_year{$year} // ''); # defined-or: keep legitimate 0 values, only undef becomes an empty cell + } + } + + print $code . "\t" . $found . "\t" . $url . $values . "\n"; +} + +print STDERR "$products products processed, $found_products products found in the database\n"; + +exit(0); diff --git a/scripts/update_all_products.pl b/scripts/update_all_products.pl index a9417ef80fff8..fc25fe850f3a1 100755 --- a/scripts/update_all_products.pl +++ b/scripts/update_all_products.pl @@ -36,8 +36,10 @@ --count do not do any processing, just count the number of products matching the --query options --just-print-codes do not do any processing, just print the barcodes ---query some_field=some_value (e.g. categories_tags=en:beers) filter the products +--query some_field=some_value (e.g. categories_tags=en:beers) filter the products (--query parameters can be repeated to have multiple filters) --query some_field=-some_value match products that don't have some_value for some_field +--query some_field=value1,value2 match products that have value1 and value2 for some_field (must be a _tags field) +--query some_field=value1\|value2 match products that have value1 or value2 for some_field (must be a _tags field) --analyze-and-enrich-product-data run all the analysis and enrichments --process-ingredients compute allergens, additives detection --clean-ingredients remove nutrition facts, conservation conditions etc. 
@@ -150,11 +152,11 @@ my $fix_obsolete; my $fix_last_modified_t; # Will set the update key and ensure last_updated_t is initialised -my $query_ref = {}; # filters for mongodb query +my $query_params_ref = {}; # filters for mongodb query GetOptions( "key=s" => \$key, # string - "query=s%" => $query_ref, + "query=s%" => $query_params_ref, "count" => \$count, "just-print-codes" => \$just_print_codes, "fields=s" => \@fields_to_update, @@ -304,42 +306,10 @@ # Get a list of all products not yet updated # Use query filters entered using --query categories_tags=en:plant-milks -use boolean; +# Build the mongodb query from the --query parameters +my $query_ref = {}; -foreach my $field (sort keys %{$query_ref}) { - - my $not = 0; - - if ($query_ref->{$field} =~ /^-/) { - $query_ref->{$field} = $'; - $not = 1; - } - - if ($query_ref->{$field} eq 'null') { - # $query_ref->{$field} = { '$exists' => false }; - $query_ref->{$field} = undef; - } - elsif ($query_ref->{$field} eq 'exists') { - $query_ref->{$field} = {'$exists' => true}; - } - elsif ($field =~ /_t$/) { # created_t, last_modified_t etc. - $query_ref->{$field} += 0; - } - # Multiple values separated by commas - elsif ($query_ref->{$field} =~ /,/) { - my @tagids = split(/,/, $query_ref->{$field}); - - if ($not) { - $query_ref->{$field} = {'$nin' => \@tagids}; - } - else { - $query_ref->{$field} = {'$in' => \@tagids}; - } - } - elsif ($not) { - $query_ref->{$field} = {'$ne' => $query_ref->{$field}}; - } -} +add_params_to_query($query_params_ref, $query_ref); # Query products that have the _id field stored as a number if ($fix_non_string_ids) { diff --git a/stop_words.txt b/stop_words.txt index cb3159ae2b16c..b505e2dedb4ae 100644 --- a/stop_words.txt +++ b/stop_words.txt @@ -196,6 +196,7 @@ pageId pageIdAction pageviewPosition param +params Pâtes pectine Perl @@ -292,4 +293,4 @@ redis init licious Odoo -CRM \ No newline at end of file +CRM