diff --git a/.github/labeler.yml b/.github/labeler.yml index babefa64c92c8..05ff365ea71b1 100644 --- a/.github/labeler.yml +++ b/.github/labeler.yml @@ -505,21 +505,6 @@ Data import: - any-glob-to-any-file: 'cgi/generate_sample_import_file.pl' # https://openfoodfacts.github.io/openfoodfacts-server/dev/ref-perl-pod/ProductOpener/Ingredients.html -🥗 Ingredients: -- changed-files: - - any-glob-to-any-file: 'lib/ProductOpener/Ingredients.pm' - - any-glob-to-any-file: 'taxonomies/food/ingredients.txt' - - any-glob-to-any-file: 'tests/unit/ingredients.t' - - any-glob-to-any-file: 'tests/unit/ingredients_analysis.t' - - any-glob-to-any-file: 'tests/unit/ingredients_clean.t' - - any-glob-to-any-file: 'tests/unit/ingredients_nesting.t' - - any-glob-to-any-file: 'tests/unit/ingredients_parsing.t' - - any-glob-to-any-file: 'tests/unit/ingredients_parsing_todo.t' - - any-glob-to-any-file: 'tests/unit/ingredients_percent.t' - - any-glob-to-any-file: 'tests/unit/ingredients_processing.t' - - any-glob-to-any-file: 'tests/unit/ingredients_tags.t' - - any-glob-to-any-file: 'scripts/test_ingredient_parser.pl' - # We want to improve the analysis of ingredient list to extract ingredients and their properties, across languages. 
# This is helpful to determine if a product is vegan, vegetarian, contains palm oil, is kosher/halal, the exact Nutri-Score, how much environmental impact it has… # https://wiki.openfoodfacts.org/Ingredients_Extraction_and_Analysis @@ -538,7 +523,23 @@ Data import: - any-glob-to-any-file: 'scripts/extract_individual_ingredients.pl' - any-glob-to-any-file: 'scripts/aggregate_ingredients.pl' - any-glob-to-any-file: 'lib/ProductOpener/Ingredients.pm' - + - any-glob-to-any-file: 'tests/unit/ingredients_parsing.t' + - any-glob-to-any-file: 'lib/ProductOpener/Ingredients.pm' + - any-glob-to-any-file: 'taxonomies/food/ingredients.txt' + - any-glob-to-any-file: 'tests/unit/ingredients.t' + - any-glob-to-any-file: 'tests/unit/ingredients_analysis.t' + - any-glob-to-any-file: 'tests/unit/ingredients_clean.t' + - any-glob-to-any-file: 'tests/unit/ingredients_nesting.t' + - any-glob-to-any-file: 'tests/unit/ingredients_parsing_todo.t' + - any-glob-to-any-file: 'tests/unit/ingredients_percent.t' + - any-glob-to-any-file: 'tests/unit/ingredients_processing.t' + - any-glob-to-any-file: 'tests/unit/ingredients_tags.t' + - any-glob-to-any-file: 'scripts/test_ingredient_parser.pl' + - any-glob-to-any-file: 'tests/unit/expected_test_results/ingredients/en-category-types.json' + - any-glob-to-any-file: 'tests/unit/expected_test_results/ingredients/fr-infinite-loop-allergens.json' + - any-glob-to-any-file: 'tests/unit/expected_test_results/ingredients/fr-marmelade.json' + - any-glob-to-any-file: 'tests/unit/expected_test_results/ingredients/fr-percents-origins-2.json' + - any-glob-to-any-file: 'tests/unit/expected_test_results/ingredients/ru-russian-oil.json' # Labels are all claims present on product packages. 
# https://wiki.openfoodfacts.org/Labels # Tracking issue: diff --git a/.github/workflows/generate-doc.yml b/.github/workflows/generate-doc.yml index b1a1abd8b5c20..b049e3df7ad72 100644 --- a/.github/workflows/generate-doc.yml +++ b/.github/workflows/generate-doc.yml @@ -81,7 +81,7 @@ jobs: - name: Deploy API documentation to GitHub Pages - uses: JamesIves/github-pages-deploy-action@v4.6.9 + uses: JamesIves/github-pages-deploy-action@v4.7.2 # we only deploy on push to main if: | github.event_name == 'push' && github.event.ref == 'refs/heads/main' diff --git a/cgi/product_image_move.pl b/cgi/product_image_move.pl index cd2e0f951de7f..874d2a220495a 100644 --- a/cgi/product_image_move.pl +++ b/cgi/product_image_move.pl @@ -48,10 +48,7 @@ my $code = normalize_code(single_param('code')); my $imgids = single_param('imgids'); my $move_to = single_param('move_to_override'); -if ($move_to =~ /^(off|obf|opf|opff)$/) { - $move_to .= ':' . $code; -} -elsif ($move_to ne 'trash') { +if ($move_to ne 'trash') { $move_to = normalize_code($move_to); } my $copy_data = single_param('copy_data_override'); @@ -223,14 +220,6 @@ $response{url} = product_url($move_to); - # URL on another server? - my $server = server_for_product_id($move_to); - if (defined $server) { - my $url = "https://" . $subdomain . "." . $options{other_servers}{$server}{domain} . $response{url}; - $url =~ s/\/([a-z]+):([0-9])/\/$2/; - $response{url} = $url; - } - $response{link} = '' . $move_to . ''; } diff --git a/docs/dev/how-to-quick-start-guide.md b/docs/dev/how-to-quick-start-guide.md index 3c17e00e776ae..38c88b69c685f 100644 --- a/docs/dev/how-to-quick-start-guide.md +++ b/docs/dev/how-to-quick-start-guide.md @@ -39,14 +39,21 @@ The process of cloning the repository will create a number of symbolic links whi Make sure you also activated the [Developer mode](https://learn.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development) on your device. 
-### Windows Subsystem for Linux (WSL) Prequisites +### Windows Subsystem for Linux (WSL) Prerequisites Ensure that you have WSL installed on your Windows machine. For instructions on how to do so, you can follow [Microsoft's guide to install WSL](https://learn.microsoft.com/en-us/windows/wsl/install). After succcessfully installing WSL, you need to set up your Linux distribution and install Docker on it: - Go to the Microsoft Store and install [Ubuntu](https://www.microsoft.com/store/productId/9PDXGNCFSCZV?ocid=pdpshare) as your Linux distribution - Open Ubuntu and execute the commands/instructions specified in [Installing Docker Engine on Ubuntu](https://docs.docker.com/engine/install/ubuntu/) (**NOTE: this is NOT the same thing as Docker for Desktop) -- Restart your computer to ensure all changes take effect and WSL can properly integrate with Docker +- Restart your computer to ensure all changes take effect and WSL can properly integrate with Docker + +### Docker Desktop Prerequisite + +If you use Docker Desktop: + +- ensure you allow enough memory for your VMs (at least 4 GB) +- ensure you have enabled host networking (in Resources / Network) ## 2. Fork and clone the repository from GitHub diff --git a/lib/ProductOpener/Display.pm b/lib/ProductOpener/Display.pm index 898bdf9159188..1c3081681902b 100644 --- a/lib/ProductOpener/Display.pm +++ b/lib/ProductOpener/Display.pm @@ -2596,6 +2596,11 @@ HTML $extra_column_searchable .= ', {"searchable": false}'; } + # additive table has an extra column for risks + if ($tagtype eq 'additives') { + $extra_column_searchable .= ', {"searchable": false}'; + } + $request_ref->{initjs} .= <debug("PAGINATION: START\n", {count => $count, limit => $limit, page => $page}) if $log->is_debug(); @@ -5982,29 +5986,7 @@ sub display_pagination ($request_ref, $count, $limit, $page) { . 
"\n"; } - # Close the list - - if (defined single_param("jqm")) { - if (defined $next_page_url) { - my $loadmore = lang("loadmore"); - $html .= <$loadmore -HTML - ; - } - else { - $html .= '

'; - } - } - - if (not defined $request_ref->{jqm_loadmore}) { - $html .= "\n"; - } - - if (not defined single_param("jqm")) { - $html .= $html_pages; - } - return $html; + return $html_pages; } sub search_and_export_products ($request_ref, $query_ref, $sort_by) { diff --git a/lib/ProductOpener/Images.pm b/lib/ProductOpener/Images.pm index 85364f84f0479..339907f3eeeb5 100644 --- a/lib/ProductOpener/Images.pm +++ b/lib/ProductOpener/Images.pm @@ -1050,11 +1050,11 @@ sub process_image_upload ($product_id, $imagefield, $user_id, $time, $comment, $ # Create a link to the image in /new_images so that it can be batch processed by OCR # and computer vision algorithms - (-e "$BASE_DIRS{PRODUCTS}/new_images") or mkdir("$BASE_DIRS{PRODUCTS}/new_images", 0755); + (-e "$BASE_DIRS{CACHE_NEW_IMAGES}") or mkdir("$BASE_DIRS{CACHE_NEW_IMAGES}", 0755); my $code = $product_id; $code =~ s/.*\///; symlink("$target_image_dir/$imgid.jpg", - "$BASE_DIRS{PRODUCTS}/new_images/" . time() . "." . $code . "." . $imagefield . "." . $imgid . ".jpg"); + "$BASE_DIRS{CACHE_NEW_IMAGES}/" . time() . "." . $code . "." . $imagefield . "." . $imgid . ".jpg"); # Save the image file size so that we can skip the image before processing it if it is uploaded again $images_ref->{$size_orig} = $imgid; diff --git a/lib/ProductOpener/Ingredients.pm b/lib/ProductOpener/Ingredients.pm index 604851118e571..3f66071f994b0 100644 --- a/lib/ProductOpener/Ingredients.pm +++ b/lib/ProductOpener/Ingredients.pm @@ -174,6 +174,10 @@ my $separators_except_comma = qr/(;|:|$middle_dot|\[|\{|\(|\N{U+FF08}|( $dashes my $separators = qr/($stops\s|$commas|$separators_except_comma)/i; +# Symbols to indicate labels like organic, fairtrade etc. +my @symbols = ('\*\*\*', '\*\*', '\*', '°°°', '°°', '°', '\(1\)', '\(2\)', '¹', '²'); +my $symbols_regexp = join('|', @symbols); + # do not add sub ( ) in the regexps below as it would change which parts gets matched in $1, $2 etc. 
in other regexps that use those regexps # put the longest strings first, so that we can match "possible traces" before "traces" my %may_contain_regexps = ( @@ -533,6 +537,84 @@ my %of_finished_product = ( sv => " sylt", ); +=head1 FUNCTIONS + +=head2 init_percent_or_quantity_regexps($ingredients_lc) - initialize regular expressions needed for ingredients parsing + +This function creates regular expressions that match quantities or percent of an ingredient, +including localized strings like "minimum" + +=cut + +# prepared with +my %prepared_with = ( + en => "(?:made|prepared|produced) with", + da => "fremstillet af", + es => "elabora con", + fr => "(?:(?:é|e)labor(?:é|e)|fabriqu(?:é|e)|pr(?:é|e)par(?:é|e)|produit)(?:e)?(?:s)? (?:avec|à partir)", + hr => "(?:proizvedeno od|sadrži)", + nl => "bereid met", + sv => "är", +); + +my %min_regexp = ( + en => "min|min\.|minimum", + ca => "min|min\.|mín|mín\.|mínim|minim", + es => "min|min\.|mín|mín\.|mínimo|minimo|minimum", + fr => "min|min\.|mini|minimum", + hr => "min|min\.|mini|minimum", + pl => "min|min\.|minimum", +); + +my %max_regexp = ( + en => "max|max\.|maximum", + ca => "max|max\.|màxim", + es => "max|max\.|máximo", + fr => "max|max\.|maxi|maximum", + hr => "max|max\.|maxi|maximum", + pl => "max|max\.|maximum", +); + +# Words that can be ignored after a percent +# e.g. 
50% du poids total, 30% of the total weight +# groups need to be non-capturing: prefixed with (?: + +my %ignore_strings_after_percent = ( + en => "of (?:the )?(?:total weight|grain is wholegrain rye)", + es => "(?:en el chocolate(?: con leche)?)", + fi => "jauhojen määrästä", + fr => "(?:dans le chocolat(?: (?:blanc|noir|au lait))?)|(?:du poids total|du poids)", + sv => "fetthalt", +); + +my %percent_or_quantity_regexps = (); + +sub init_percent_or_quantity_regexps($ingredients_lc) { + + if (not exists $percent_or_quantity_regexps{$ingredients_lc}) { + + my $prepared_with = $prepared_with{$ingredients_lc} || ''; + + my $min_regexp = $min_regexp{$ingredients_lc} || ''; + + my $max_regexp = $max_regexp{$ingredients_lc} || ''; + + my $ignore_strings_after_percent = $ignore_strings_after_percent{$ingredients_lc} || ''; + + # Regular expression to find percent or quantities + # $percent_or_quantity_regexp has 2 capturing groups: one for the number, and one for the % sign or the unit + $percent_or_quantity_regexps{$ingredients_lc} = '(?:' . "(?:$prepared_with )" . ' )?' # optional produced with + . '(?:>|' . $max_regexp . '|<|' . $min_regexp . '|\s|\.|:)*' # optional maximum, minimum, and separators + . '(?:\d+(?:[,.]\d+)?\s*-\s*?)?' # number+hyphens, first part (10-) of "10-12%" + . '(\d+(?:(?:\,|\.)\d+)?)\s*' # number, possibly with a dot or comma + . '(\%|g|gr|mg|kg|ml|cl|dl|l)\s*' # % or unit + . '(?:' . $min_regexp . '|' . $max_regexp . '|' # optional minimum, optional maximum + . $ignore_strings_after_percent . '|\s|\)|\]|\}|\*)*'; # strings that can be ignored + } + + return; +} + # Labels that we want to recognize in the ingredients # e.g. 
"fraises issues de l'agriculture biologique" @@ -555,8 +637,6 @@ my %labels_regexps = (); # Needs to be called after Tags.pm has loaded taxonomies -=head1 FUNCTIONS - =head2 init_labels_regexps () - initialize regular expressions needed for ingredients parsing This function creates regular expressions that match all variations of labels @@ -728,47 +808,6 @@ sub extract_ingredients_from_image ($product_ref, $id, $ocr_engine, $results_ref return; } -# prepared with -my %prepared_with = ( - en => "(?:made|prepared|produced) with", - da => "fremstillet af", - es => "elabora con", - fr => "(?:(?:é|e)labor(?:é|e)|fabriqu(?:é|e)|pr(?:é|e)par(?:é|e)|produit)(?:e)?(?:s)? (?:avec|à partir)", - hr => "(?:proizvedeno od|sadrži)", - nl => "bereid met", - sv => "är", -); - -my %min_regexp = ( - en => "min|min\.|minimum", - ca => "min|min\.|mín|mín\.|mínim|minim", - es => "min|min\.|mín|mín\.|mínimo|minimo|minimum", - fr => "min|min\.|mini|minimum", - hr => "min|min\.|mini|minimum", - pl => "min|min\.|minimum", -); - -my %max_regexp = ( - en => "max|max\.|maximum", - ca => "max|max\.|màxim", - es => "max|max\.|máximo", - fr => "max|max\.|maxi|maximum", - hr => "max|max\.|maxi|maximum", - pl => "max|max\.|maximum", -); - -# Words that can be ignored after a percent -# e.g. 
50% du poids total, 30% of the total weight -# groups need to be non-capturing: prefixed with (?: - -my %ignore_strings_after_percent = ( - en => "of (?:the )?(?:total weight|grain is wholegrain rye)", - es => "(?:en el chocolate(?: con leche)?)", - fi => "jauhojen määrästä", - fr => "(?:dans le chocolat(?: (?:blanc|noir|au lait))?)|(?:du poids total|du poids)", - sv => "fetthalt", -); - =head2 has_specific_ingredient_property ( product_ref, searched_ingredient_id, property ) Check if the specific ingredients structure (extracted from the end of the ingredients list and product labels) @@ -1891,28 +1930,12 @@ sub parse_ingredients_text_service ($product_ref, $updated_product_fields_ref, $ my $and = $and{$ingredients_lc} || " and "; - my $prepared_with = $prepared_with{$ingredients_lc} || '', - - my $min_regexp = $min_regexp{$ingredients_lc} || ''; - - my $max_regexp = $max_regexp{$ingredients_lc} || ''; - - my $ignore_strings_after_percent = $ignore_strings_after_percent{$ingredients_lc} || ''; - - # Regular expression to find percent or quantities - # $percent_or_quantity_regexp has 2 capturing group: one for the number, and one for the % sign or the unit - my $percent_or_quantity_regexp = '(?:' . "(?:$prepared_with )" . ' )?' # optional produced with - . '(?:>|' . $max_regexp . '|<|' . $min_regexp . '|\s|\.|:)*' # optional maximum, minimum, and separators - . '(?:\d+(?:[,.]\d+)?\s*-\s*?)?' # number+hyphens, first part (10-) of "10-12%" - . '(\d+(?:(?:\,|\.)\d+)?)\s*' # number, possibly with a dot or comma - . '(\%|g|gr|mg|kg|ml|cl|dl|l)\s*' # % or unit - . '(?:' . $min_regexp . '|' . $max_regexp . '|' # optional minimum, optional maximum - . $ignore_strings_after_percent . 
'|\s|\)|\]|\}|\*)*'; # strings that can be ignored - my $per = $per{$ingredients_lc} || ' per '; my $of_finished_product = $of_finished_product{$ingredients_lc} || ''; my $per_100g_regexp = "(${per}|\/)${one_hundred_grams_or_ml}(?:$of_finished_product)?"; + my $percent_or_quantity_regexp = $percent_or_quantity_regexps{$ingredients_lc}; + # Extract phrases related to specific ingredients at the end of the ingredients list $text = parse_specific_ingredients_from_text($product_ref, $text, $percent_or_quantity_regexp, $per_100g_regexp); @@ -4548,7 +4571,20 @@ sub normalize_fr_a_de_b ($a, $b) { } } -=head2 normalize_a_of_b ( $lc, $a, $b, $of_bool, $alternate_names_ref ) +# This function removes labels like "organic" from ingredients, so that we can check if they exist +# with canonicalize_taxonomy_tag. The labels can be parsed out when doing ingredients analysis. + +sub remove_parsable_labels ($ingredients_lc, $ingredient) { + if ($ingredients_lc eq "en") { + $ingredient =~ s/(?:organic |fair trade )*//ig; + } + elsif ($ingredients_lc eq "fr") { + $ingredient =~ s/(?: biologique| bio| équitable|s|\s|$symbols_regexp)//ig; # longest label first; $symbols_regexp is interpolated directly + } + return $ingredient; +} + +=head2 normalize_a_of_b ( $ingredients_lc, $a, $b, $of_bool, $alternate_names_ref = undef ) This function is called by normalize_enumeration() @@ -4590,18 +4626,18 @@ string, comma-joined category and type, example: 'palm vegetal oil' or 'sunflowe =cut -sub normalize_a_of_b ($lc, $a, $b, $of_bool, $alternate_names_ref = undef) { +sub normalize_a_of_b ($ingredients_lc, $a, $b, $of_bool, $alternate_names_ref = undef) { $a =~ s/\s+$//; $b =~ s/^\s+//; my $a_of_b; - if (($lc eq "en") or ($lc eq "hr")) { + if (($ingredients_lc eq "en") or ($ingredients_lc eq "hr")) { # start by "with" (example: "mlijeko (s 1.0% mliječne masti)"), in which case it $b should be added after $a # start by "with etc." 
should be added at the end of the previous ingredient my %with = (hr => '(s | sa )',); - my $with = $with{$lc} || " will not match "; + my $with = $with{$ingredients_lc} || " will not match "; if ($b =~ /^$with/i) { $a_of_b = $a . " " . $b; } @@ -4609,10 +4645,10 @@ sub normalize_a_of_b ($lc, $a, $b, $of_bool, $alternate_names_ref = undef) { $a_of_b = $b . " " . $a; } } - elsif ($lc eq "es") { + elsif ($ingredients_lc eq "es") { $a_of_b = $a . " de " . $b; } - elsif ($lc eq "fr") { + elsif ($ingredients_lc eq "fr") { $b =~ s/^(de |d')//; if (($b =~ /^(a|e|i|o|u|y|h)/i) && ($of_bool == 1)) { @@ -4625,11 +4661,11 @@ sub normalize_a_of_b ($lc, $a, $b, $of_bool, $alternate_names_ref = undef) { $a_of_b = $a . " " . $b; } } - elsif (($lc eq "de") or ($lc eq "ru") or ($lc eq "pl")) { + elsif (($ingredients_lc eq "de") or ($ingredients_lc eq "ru") or ($ingredients_lc eq "pl")) { $a_of_b = $a . " " . $b; } else { - die("unsupported language in normalize_a_of_b: $lc, $a, $b"); + die("unsupported language in normalize_a_of_b: $ingredients_lc, $a, $b"); } # If we have alternate categories, check if $a_of_b is an existing taxonomy entry, @@ -4638,7 +4674,11 @@ sub normalize_a_of_b ($lc, $a, $b, $of_bool, $alternate_names_ref = undef) { if (defined $alternate_names_ref) { my $name_exists; - canonicalize_taxonomy_tag($lc, "ingredients", $a_of_b, \$name_exists); + # remove labels like "organic", "fairtrade": they can be parsed out when doing ingredients analysis + # TODO: use the labels regexps instead + my $a_of_b_copy = remove_parsable_labels($ingredients_lc, $a_of_b); + canonicalize_taxonomy_tag($ingredients_lc, "ingredients", $a_of_b_copy, \$name_exists); + $log->debug("normalize_a_of_b", {a => $a, b => $b, a_of_b => $a_of_b, a_of_b_copy => $a_of_b_copy, name_exists => $name_exists}) if $log->is_debug(); if (not $name_exists) { foreach my $alternate_name (@{$alternate_names_ref}) { @@ -4646,7 +4686,10 @@ sub normalize_a_of_b ($lc, $a, $b, $of_bool, $alternate_names_ref = undef) { = $alternate_name; # make a copy so that we can 
modify it without changing the array entry $alternate_name_copy =~ s//$b/; my $alternate_name_exists; - canonicalize_taxonomy_tag($lc, "ingredients", $alternate_name_copy, \$alternate_name_exists); + canonicalize_taxonomy_tag($ingredients_lc, "ingredients", $alternate_name_copy, + \$alternate_name_exists); + $log->debug("normalize_a_of_b - alternate name", {alternate_name => $alternate_name, alternate_name_copy => $alternate_name_copy, alternate_name_exists => $alternate_name_exists}) + if $log->is_debug(); if ($alternate_name_exists) { $a_of_b = $alternate_name_copy; last; @@ -4658,7 +4701,7 @@ sub normalize_a_of_b ($lc, $a, $b, $of_bool, $alternate_names_ref = undef) { return $a_of_b; } -=head2 normalize_enumeration ($lc, $category, $types, $of_bool, $alternate_names_ref = undef) +=head2 normalize_enumeration ($ingredients_lc, $category, $types, $of_bool, $alternate_names_ref = undef, $do_not_output_parent = undef) This function is called by develop_ingredients_categories_and_types() @@ -4668,7 +4711,7 @@ Some ingredients are specified by an ingredient "category" (e.g. 
"oil") and a "t This function combines the category to all elements of the types string $category = "Vegetal oil" and $types = "palm, sunflower and olive" will return -"palm vegetal oil, sunflower vegetal oil, olive vegetal oil" +"vegetal oil (palm vegetal oil, sunflower vegetal oil, olive vegetal oil)" =head3 Arguments @@ -4678,21 +4721,41 @@ language abbreviation (en for English, for example) =head4 category -string, as defined in %ingredients_categories_and_types, example: 'Vegetal oil' for 'Vegetal oil (sunflower, olive and palm)' +string, as matched from definition in %ingredients_categories_and_types, example: 'Vegetal oil' for 'Vegetal oil (sunflower, olive and palm)' =head4 types -string, as defined in %ingredients_categories_and_types, example: 'sunflower, olive and palm' for 'Vegetal oil (sunflower, olive and palm)' +string, as matched from definition in %ingredients_categories_and_types, example: 'sunflower, olive and palm' for 'Vegetal oil (sunflower, olive and palm)' + +=head4 $of_bool - indicate if we want to construct entries like " of " + +e.g. in French we combine "huile" and "olive" to "huile d'olive" +but we combine "poivron" and "rouge" to "poivron rouge". + +=head4 $alternate_names_ref + +Reference to an array of alternate names for the category + +=head4 $do_not_output_parent - indicate if we want to output the parent ingredient + +e.g. 
for "carbonates d'ammonium et de sodium", we want only "carbonates d'ammonium, carbonates de sodium" +and not "carbonates (carbonates d'ammonium, carbonates de sodium)" as "carbonates" is another additive =head3 Return value =head4 Transformed ingredients list text -string, comma-joined category with all elements of the types, example: 'sunflower vegetal oil, olive vegetal oil, palm vegetal oil' +string, with the type + a list of comma-joined category with all elements of the types +example: 'vegetal oils (sunflower vegetal oil, olive vegetal oil, palm vegetal oil)' =cut -sub normalize_enumeration ($lc, $category, $types, $of_bool, $alternate_names_ref = undef) { +sub normalize_enumeration ( + $ingredients_lc, $category, $types, $of_bool, + $alternate_names_ref = undef, + $do_not_output_parent = undef + ) +{ $log->debug("normalize_enumeration", {category => $category, types => $types}) if $log->is_debug(); # If there is a trailing space, save it and output it @@ -4702,12 +4765,27 @@ sub normalize_enumeration ($lc, $category, $types, $of_bool, $alternate_names_re } # do not match anything if we don't have a translation for "and" - my $and = $and{$lc} || " will not match "; + my $and = $and{$ingredients_lc} || " will not match "; my @list = split(/$obrackets|$cbrackets|\/| \/ | $dashes |$commas |$commas|$and/i, $types); - return - join(", ", map {normalize_a_of_b($lc, $category, $_, $of_bool, $alternate_names_ref)} @list) . $trailing_space; + # If we have a percent or quantity, we output it only for the parent + my $category_without_percent_or_quantity = $category; + my $percent_or_quantity_regexp = $percent_or_quantity_regexps{$ingredients_lc}; + $category_without_percent_or_quantity =~ s/$percent_or_quantity_regexp//ig; + + my $list = join( + ", ", + map { + normalize_a_of_b($ingredients_lc, $category_without_percent_or_quantity, $_, $of_bool, $alternate_names_ref) + } @list + ); + + unless ($do_not_output_parent) { + $list = $category . " (" . $list . 
")"; + } + + return $list . $trailing_space; } # iodure et hydroxide de potassium @@ -4716,12 +4794,12 @@ sub normalize_fr_a_et_b_de_c ($a, $b, $c) { return normalize_fr_a_de_b($a, $c) . ", " . normalize_fr_a_de_b($b, $c); } -sub normalize_additives_enumeration ($lc, $enumeration) { +sub normalize_additives_enumeration ($ingredients_lc, $enumeration) { $log->debug("normalize_additives_enumeration", {enumeration => $enumeration}) if $log->is_debug(); # do not match anything if we don't have a translation for "and" - my $and = $and{$lc} || " will not match "; + my $and = $and{$ingredients_lc} || " will not match "; my @list = split(/$obrackets|$cbrackets|\/| \/ | $dashes |$commas |$commas|$and/i, $enumeration); @@ -5802,16 +5880,8 @@ my %ingredients_categories_and_types = ( # huiles { categories => [ - "huile", - "huile végétale", - "huiles végétales", - "matière grasse", - "matières grasses", - "matière grasse végétale", - "matières grasses végétales", - "graisse", - "graisse végétale", - "graisses végétales", + # allow multiple types of oils in the category (e.g. "huiles et graisses"), with modifiers (e.g. "végétale") + '(?:(?: et )?(?:huile|graisse|stéarine|matière\s? 
grasse)s?)+(?: (?:végétale|(?:partiellement |totalement |non(?:-| |))hydrogénée?)s?)*', ], types => [ "arachide", "avocat", "carthame", "chanvre", @@ -5821,7 +5891,14 @@ my %ingredients_categories_and_types = ( "olive vierge", "olive extra vierge", "olive vierge extra", "palme", "palmiste", "pépins de raisin", "sal", "sésame", "soja", "tournesol", "tournesol oléique", - ] + ], + alternate_names => [ + "huile de ", + "huile d'", + "matière grasse de ", + "graisse de ", + "stéarine de " + ], }, # (natural) extract { @@ -5887,7 +5964,10 @@ my %ingredients_categories_and_types = ( types => [ "aluminium", "ammonium", "calcium", "cuivre", "fer", "magnésium", "manganèse", "potassium", "sodium", "zinc", - ] + ], + # avoid turning "carbonates d'ammonium et de sodium" into "carbonates (carbonates d'ammonium, carbonates de sodium)" + # as "carbonates" is an additive + do_not_output_parent => 1, }, # peppers {categories => ["piment", "poivron"], types => ["vert", "jaune", "rouge",], of_bool => 0,}, @@ -5922,7 +6002,7 @@ my %ingredients_categories_and_types = ( "voćni", ] }, - # falvouring + # flavouring { categories => ["prirodna aroma", "prirodne arome",], types => ["citrusa sa ostalim prirodnim aromama", "limuna", "mente", "mente s drugim prirodnim aromama",] @@ -6023,7 +6103,7 @@ my %ingredients_categories_and_types = ( ru => [ # oils { - categories => ["масло", "масло растительное",], + categories => ['масло(?: растительное)?',], types => [ "Подсолнечное", "Пальмовое", "Рапсовое", "Кокосовое", "горчицы", "Соевое", "Пальмоядровое", "Оливковое", "пальм", @@ -6033,16 +6113,16 @@ my %ingredients_categories_and_types = ( ); -# Symbols to indicate labels like organic, fairtrade etc. 
-my @symbols = ('\*\*\*', '\*\*', '\*', '°°°', '°°', '°', '\(1\)', '\(2\)', '¹', '²'); -my $symbols_regexp = join('|', @symbols); - sub develop_ingredients_categories_and_types ($ingredients_lc, $text) { $log->debug("develop_ingredients_categories_and_types", {ingredients_lc => $ingredients_lc, text => $text}) if $log->is_debug(); if (defined $ingredients_categories_and_types{$ingredients_lc}) { + my $percent_or_quantity_regexp = $percent_or_quantity_regexps{$ingredients_lc}; + # Make the 2 capture groups (for number and for % or unit, starting with (\d and (\% non capturing + $percent_or_quantity_regexp =~ s/\(\\/\(?:\\/g; + foreach my $categories_and_types_ref (@{$ingredients_categories_and_types{$ingredients_lc}}) { my $category_regexp = ""; foreach my $category (@{$categories_and_types_ref->{categories}}) { @@ -6051,7 +6131,6 @@ sub develop_ingredients_categories_and_types ($ingredients_lc, $text) { if ($unaccented_category ne $category) { $category_regexp .= '|' . $unaccented_category . '|' . $unaccented_category . 's'; } - } $category_regexp =~ s/^\|//; @@ -6066,6 +6145,9 @@ sub develop_ingredients_categories_and_types ($ingredients_lc, $text) { $category_regexp = '(?:' . $category_regexp . ')(?:' . $symbols_regexp . ')*'; } + # Also match % after the category (e.g. "vegetal oil 45% (palm, rapeseed)" + $category_regexp .= '\s*(?:' . $percent_or_quantity_regexp . ')?'; + my $type_regexp = ""; foreach my $type (@{$categories_and_types_ref->{types}}) { $type_regexp .= '|' . $type . '|' . $type . 
's'; @@ -6076,6 +6158,8 @@ sub develop_ingredients_categories_and_types ($ingredients_lc, $text) { } $type_regexp =~ s/^\|//; + #$log->debug("develop_ingredients_categories_and_types", { category_regexp => $category_regexp, type_regexp => $type_regexp}) if $log->is_debug(); + my $of_bool = 1; if (defined $categories_and_types_ref->{of_bool}) { $of_bool = $categories_and_types_ref->{of_bool}; @@ -6109,19 +6193,20 @@ sub develop_ingredients_categories_and_types ($ingredients_lc, $text) { or ($ingredients_lc eq "pl")) { # vegetable oil (palm, sunflower and olive) -> palm vegetable oil, sunflower vegetable oil, olive vegetable oil + # Note: not using the /x modifier to put spaces in the regexp, as it doesn't work if the interpolated variables contain spaces themselves... $text - =~ s/($category_regexp)(?::|\(|\[| | $of )+((($type_regexp)($symbols_regexp|\s)*( |\/| \/ | - |,|, |$and|$of|$and_of|$and_or)+)+($type_regexp)($symbols_regexp|\s)*)\b(\s?(\)|\]))?/normalize_enumeration($ingredients_lc,$1,$2,$of_bool, $categories_and_types_ref->{alternate_names})/ieg; + =~ s/($category_regexp)(?::|\(|\[| | $of )+((($type_regexp)($symbols_regexp|\s)*(\s|\/|\s\/\s|\s-\s|,|,\s|$and|$of|$and_of|$and_or)+)+($type_regexp)($symbols_regexp|\s)*)\b(\s?(\)|\]))?/normalize_enumeration($ingredients_lc,$1,$2,$of_bool, $categories_and_types_ref->{alternate_names},$categories_and_types_ref->{do_not_output_parent})/ieg; # vegetable oil (palm) -> palm vegetable oil $text - =~ s/($category_regexp)\s?(?:\(|\[)\s?($type_regexp)\b(\s?(\)|\]))/normalize_enumeration($ingredients_lc,$1,$2,$of_bool,$categories_and_types_ref->{alternate_names})/ieg; + =~ s/($category_regexp)\s?(?:\(|\[)\s?($type_regexp)\b(\s?(\)|\]))/normalize_enumeration($ingredients_lc,$1,$2,$of_bool,$categories_and_types_ref->{alternate_names},$categories_and_types_ref->{do_not_output_parent})/ieg; # vegetable oil: palm $text - =~ 
s/($category_regexp)\s?(?::)\s?($type_regexp)(?=$separators|.|$)/normalize_enumeration($ingredients_lc,$1,$2,$of_bool,$categories_and_types_ref->{alternate_names})/ieg; + =~ s/($category_regexp)\s?(?::)\s?($type_regexp)(?=$separators|.|$)/normalize_enumeration($ingredients_lc,$1,$2,$of_bool,$categories_and_types_ref->{alternate_names},$categories_and_types_ref->{do_not_output_parent})/ieg; # ječmeni i pšenični slad (barley and wheat malt) -> ječmeni slad, pšenični slad $text - =~ s/((?:(?:$type_regexp)(?: |\/| \/ | - |,|, |$and|$of|$and_of|$and_or)+)+(?:$type_regexp))\s*($category_regexp)/normalize_enumeration($ingredients_lc,$2,$1,$of_bool,$categories_and_types_ref->{alternate_names})/ieg; + =~ s/((?:(?:$type_regexp)(?: |\/| \/ | - |,|, |$and|$of|$and_of|$and_or)+)+(?:$type_regexp))\s*($category_regexp)/normalize_enumeration($ingredients_lc,$2,$1,$of_bool,$categories_and_types_ref->{alternate_names},$categories_and_types_ref->{do_not_output_parent})/ieg; } elsif ($ingredients_lc eq "fr") { # arôme naturel de pomme avec d'autres âromes @@ -6140,20 +6225,20 @@ sub develop_ingredients_categories_and_types ($ingredients_lc, $text) { # require a " et " and/or " de " at the end of the enumeration # $text - =~ s/($category_regexp)(?::| | de | d')+((($type_regexp)($symbols_regexp|\s)*( |\/| \/ | - |,|, | et | de | et de | et d'| d')+)*($type_regexp)($symbols_regexp|\s)*( |\/| \/ | - |,|, )*( et | de | et de | et d'| d'| d'autres | et d'autres )( |\/| \/ | - |,|, )*($type_regexp)($symbols_regexp|\s)*)\b/normalize_enumeration($ingredients_lc,$1,$2,$of_bool, $categories_and_types_ref->{alternate_names})/ieg; + =~ s/($category_regexp)(?::| | de | d')+((($type_regexp)($symbols_regexp|\s)*( |\/| \/ | - |,|, | et | de | et de | et d'| d')+)*($type_regexp)($symbols_regexp|\s)*( |\/| \/ | - |,|, )*( et | de | et de | et d'| d'| d'autres | et d'autres )( |\/| \/ | - |,|, )*($type_regexp)($symbols_regexp|\s)*)\b/normalize_enumeration($ingredients_lc,$1,$2,$of_bool, 
$categories_and_types_ref->{alternate_names},$categories_and_types_ref->{do_not_output_parent})/ieg; # Huiles végétales (palme, colza et tournesol) $text - =~ s/($category_regexp)(?:\(|\[)(?:de |d')?((($type_regexp)($symbols_regexp|\s)*( |\/| \/ | - |,|, | et | de | et de | et d'| d')+)+($type_regexp)($symbols_regexp|\s)*)\b(\s?(\)|\]))/normalize_enumeration($ingredients_lc,$1,$2,$of_bool, $categories_and_types_ref->{alternate_names})/ieg; + =~ s/($category_regexp)(?:\(|\[)(?:de |d')?((($type_regexp)($symbols_regexp|\s)*( |\/| \/ | - |,|, | et | de | et de | et d'| d')+)+($type_regexp)($symbols_regexp|\s)*)\b(\s?(\)|\]))/normalize_enumeration($ingredients_lc,$1,$2,$of_bool, $categories_and_types_ref->{alternate_names},$categories_and_types_ref->{do_not_output_parent})/ieg; $text =~ s/fer_élémentaire/fer élémentaire/ig; # huile végétale (colza) $text - =~ s/($category_regexp)\s?(?:\(|\[)\s?($type_regexp)\b(\s?(\)|\]))/normalize_enumeration($ingredients_lc,$1,$2,$of_bool, $categories_and_types_ref->{alternate_names})/ieg; + =~ s/($category_regexp)\s?(?:\(|\[)\s?($type_regexp)\b(\s?(\)|\]))/normalize_enumeration($ingredients_lc,$1,$2,$of_bool, $categories_and_types_ref->{alternate_names}, $categories_and_types_ref->{do_not_output_parent})/ieg; # huile végétale : colza, $text - =~ s/($category_regexp)\s?(?::)\s?($type_regexp)(?=$separators|.|$)/normalize_enumeration($ingredients_lc,$1,$2,$of_bool, $categories_and_types_ref->{alternate_names})/ieg; + =~ s/($category_regexp)\s?(?::)\s?($type_regexp)(?=$separators|.|$)/normalize_enumeration($ingredients_lc,$1,$2,$of_bool, $categories_and_types_ref->{alternate_names}, $categories_and_types_ref->{do_not_output_parent})/ieg; } } @@ -6261,6 +6346,8 @@ sub preparse_ingredients_text ($ingredients_lc, $text) { init_origins_regexps(); } + init_percent_or_quantity_regexps($ingredients_lc); + my $and = $and{$ingredients_lc} || " and "; my $and_without_spaces = $and; $and_without_spaces =~ s/^ //; diff --git 
a/scripts/import_csv_file.pl b/scripts/import_csv_file.pl index 209143c3a5dd7..b714eba1797d1 100755 --- a/scripts/import_csv_file.pl +++ b/scripts/import_csv_file.pl @@ -131,7 +131,7 @@ "skip_existing_values" => \$skip_existing_values, "only_select_not_existing_images" => \$only_select_not_existing_images, "use_brand_owner_as_org_name" => \$use_brand_owner_as_org_name, -) or die("Error in command line arguments:\n$\nusage"); +) or die("Error in command line arguments:\n\n$usage"); print STDERR "import_csv_file.pl - user_id: $user_id diff --git a/stop_words.txt b/stop_words.txt index 8b7e67bb3b9cb..1fa509739073c 100644 --- a/stop_words.txt +++ b/stop_words.txt @@ -11,6 +11,8 @@ AgriBalyse AGS Alimentarius Allergènes +ammonium +d'ammonium Anses ANSES api diff --git a/templates/web/common/includes/donate_banner.tt.html b/templates/web/common/includes/donate_banner.tt.html index 2aeac2679288d..3ad09a5fbe364 100644 --- a/templates/web/common/includes/donate_banner.tt.html +++ b/templates/web/common/includes/donate_banner.tt.html @@ -13,8 +13,8 @@
open food facts logo @@ -117,8 +117,8 @@

[% lang("donation_title_2024") %]