refactor: gen_feeds_daily.pl (exports and database dumps) for all flavors (#10973)

Now that the code bases for OFF, OPF, OBF and OPFF are unified, we can
have a single gen_feeds_daily.pl file for all flavors, as the
differences are minimal (only the pro platform off-pro is different).

This PR also removes hard-coded paths like /srv/off/ so that the script
can be run in a dev environment (some utilities such as python and mongodbdump
are missing from dev, but the rest of the script can still run).
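
For illustration, the unified script now resolves everything it needs from environment variables and from ProductOpener::Paths/Config at run time. Below is a minimal, hypothetical wrapper showing how one flavor could invoke it outside systemd; the flavor names, env-file path and server root mirror the gen_feeds_daily@.service unit in this diff, but the wrapper itself is not part of the PR.

#!/usr/bin/env bash
# Hypothetical wrapper: run the unified daily feed generation for a single flavor.
# Path conventions follow the systemd unit in this PR
# (EnvironmentFile=/srv/%i/env/env.%i, WorkingDirectory=/srv/%i).
set -euo pipefail

FLAVOR="${1:?usage: $0 <flavor: off|obf|opf|opff|off-pro>}"

# Load flavor-specific variables such as PRODUCT_OPENER_FLAVOR and
# PRODUCT_OPENER_FLAVOR_SHORT, which gen_feeds_daily.sh requires.
. "/srv/${FLAVOR}/env/env.${FLAVOR}"

# The script checks for lib/ProductOpener/Paths.pm, so it must be
# launched from the server root rather than from scripts/.
cd "/srv/${FLAVOR}"
exec ./scripts/gen_feeds_daily.sh

In production, the equivalent is done by the per-flavor instances of gen_feeds_daily@.service shown below.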
stephanegigandet authored Nov 6, 2024
1 parent 7cbbec4 commit 2f10d93
Showing 26 changed files with 165 additions and 259 deletions.
17 changes: 0 additions & 17 deletions conf/systemd/gen_feeds@.service

This file was deleted.

11 changes: 0 additions & 11 deletions conf/systemd/gen_feeds@.timer

This file was deleted.

2 changes: 1 addition & 1 deletion conf/systemd/gen_feeds_daily@.service
@@ -12,6 +12,6 @@ Environment=PERL5LIB=/srv/%i/lib/:/usr/local/share/perl/5.32.1/:/usr/lib/perl5
# Apache needs some environment variables like PRODUCT_OPENER_FLAVOR_SHORT
EnvironmentFile=/srv/%i/env/env.%i
WorkingDirectory=/srv/%i
ExecStart=/srv/%i/scripts/gen_feeds_daily_%i.sh
ExecStart=/srv/%i/scripts/gen_feeds_daily.sh


2 changes: 2 additions & 0 deletions lib/ProductOpener/Config.pm
@@ -35,4 +35,6 @@ if (not defined $flavor) {
use Module::Load;
autoload("ProductOpener::Config_$flavor");

#

1;
3 changes: 1 addition & 2 deletions lib/ProductOpener/Config2_docker.pm
@@ -90,8 +90,7 @@ $conf_root = "/opt/product-opener/conf";
$sftp_root = "/mnt/podata/sftp";
$geolite2_path = $ENV{GEOLITE2_PATH};

my $mongodb_url = $ENV{MONGODB_HOST} || "mongodb";
$mongodb_host = "mongodb://$mongodb_url:27017";
$mongodb_host = $ENV{MONGODB_HOST} || "mongodb";
$mongodb = $producers_platform ? "off-pro" : "off";
$mongodb_timeout_ms = 50000; # config option max_time_ms/maxTimeMS

2 changes: 1 addition & 1 deletion lib/ProductOpener/Config2_sample.pm
@@ -71,7 +71,7 @@ $sftp_root = "/home/sftp";
$geolite2_path = '/usr/local/share/GeoLite2-Country/GeoLite2-Country.mmdb';

$mongodb = "off"; # MongoDB database name
$mongodb_host = "mongodb://localhost";
$mongodb_host = "localhost";
$mongodb_timeout_ms = 50000; # config option max_time_ms/maxTimeMS

$memd_servers = ["127.0.0.1:11211"];
8 changes: 8 additions & 0 deletions lib/ProductOpener/Paths.pm
@@ -74,6 +74,14 @@ A hashmap containing references to base directories

%BASE_DIRS = ();

=head3 $BASE_DIRS{SCRIPTS}
Directory for scripts
=cut

$BASE_DIRS{SCRIPTS} = _source_dir() . "/scripts";

=head3 $BASE_DIRS{LOGS}
Directory for logging
20 changes: 10 additions & 10 deletions scripts/export_csv_file.pl
@@ -68,13 +68,13 @@
;

my %query_fields_values = ();
my $fields;
my $extra_fields;
my $fields = '';
my $extra_fields = '';
my $separator = "\t";
my $include_images_paths;
my $query_codes_from_file;
my $export_computed_fields;
my $export_canonicalized_tags_fields;
my $include_images_paths = 0;
my $query_codes_from_file = '';
my $export_computed_fields = 0;
my $export_canonicalized_tags_fields = 0;

GetOptions(
"fields=s" => \$fields,
@@ -119,7 +119,7 @@
}
}

if (defined $query_codes_from_file) {
if ($query_codes_from_file) {
my @codes = ();
open(my $in, "<", "$query_codes_from_file") or die("Cannot read $query_codes_from_file: $!\n");
while (<$in>) {
@@ -146,15 +146,15 @@
$args_ref->{export_canonicalized_tags_fields} = 1;
}

if ((defined $fields) and ($fields ne "")) {
if ($fields) {
$args_ref->{fields} = [split(/,/, $fields)];
}

if ((defined $extra_fields) and ($extra_fields ne "")) {
if ($extra_fields) {
$args_ref->{extra_fields} = [split(/,/, $extra_fields)];
}

if (defined $include_images_paths) {
if ($include_images_paths) {
$args_ref->{include_images_paths} = 1;
}

2 changes: 1 addition & 1 deletion scripts/export_database.pl
@@ -302,7 +302,7 @@ sub sanitize_field_content {
$code < 1 and next;

$count++;
print "$count \n" if ($count % 1000 == 0); # print number of products each 1000
print "$count \n" if ($count % 10000 == 0); # print number of products each 10000

foreach my $field (@export_fields) {

4 changes: 2 additions & 2 deletions scripts/export_products_data_and_images.pl
@@ -179,7 +179,7 @@
$tar_cmd = "cvfz";
}
print STDERR "Executing tar command: tar $tar_cmd $products_file -C $BASE_DIRS{PRODUCTS} -T $tmp_file\n";
system('tar', $tar_cmd, $products_file, "-C", $BASE_DIRS{PRODUCTS}, "-T", $tmp_file);
system("tar $tar_cmd $products_file -C $BASE_DIRS{PRODUCTS} -T $tmp_file > /dev/null 2>&1");
}

if (defined $images_file) {
@@ -189,7 +189,7 @@
$tar_cmd = "cvfz";
}
print STDERR "Executing tar command: tar $tar_cmd $images_file -C $BASE_DIRS{PRODUCTS_IMAGES} -T $tmp_file\n";
system('tar', $tar_cmd, $images_file, "-C", $BASE_DIRS{PRODUCTS_IMAGES}, "-T", $tmp_file);
system("tar $tar_cmd $images_file -C $BASE_DIRS{PRODUCTS_IMAGES} -T $tmp_file > /dev/null 2>&1");
}

print STDERR "$i products exported.\n";
9 changes: 0 additions & 9 deletions scripts/gen_feeds.sh

This file was deleted.

113 changes: 105 additions & 8 deletions scripts/gen_feeds_daily.sh
@@ -1,14 +1,111 @@
#!/bin/sh
#!/usr/bin/env bash

# check we have the environment variables PRODUCT_OPENER_FLAVOR and PRODUCT_OPENER_FLAVOR_SHORT, otherwise exit

if [ -z "$PRODUCT_OPENER_FLAVOR" ] || [ -z "$PRODUCT_OPENER_FLAVOR_SHORT" ]; then
>&2 echo "Environment variables PRODUCT_OPENER_FLAVOR and PRODUCT_OPENER_FLAVOR_SHORT are required"
exit 1
fi

# this script must be launched from server root (e.g. /srv/off)
# check that we have a lib/ProductOpener/Paths.pm file, otherwise exit

if [ ! -f lib/ProductOpener/Paths.pm ]; then
>&2 echo "lib/ProductOpener/Paths.pm not found. ./scripts/gen_feeds_daily.sh must be launched from server root"
exit 1
fi

export PERL5LIB=lib:$PERL5LIB

# load paths
. <(perl -e 'use ProductOpener::Paths qw/:all/; print base_paths_loading_script()')

# load PRODUCT_OPENER_DOMAIN and MONGODB_HOST
. <(perl -e 'use ProductOpener::Config qw/:all/; print "export PRODUCT_OPENER_DOMAIN=$server_domain\nexport MONGODB_HOST=$mongodb_host";')

# we should now have PRODUCT_OPENER_DOMAIN set (from Config.pm in production mode), check it
if [ -z "$PRODUCT_OPENER_DOMAIN" ]; then
>&2 echo "Environment variable PRODUCT_OPENER_DOMAIN not set"
exit 1
fi

cd $OFF_SCRIPTS_DIR

# off-pro flavor: we don't generate most exports
# but we have some special processing
if [ "$PRODUCT_OPENER_FLAVOR" == "off-pro" ]; then
echo "Generating feeds for off-pro flavor"
./save_org_product_data_daily_off_pro.pl
echo "Skipping exports for off-pro flavor"
exit 0
fi

cd /srv/off/scripts
export PERL5LIB="../lib:${PERL5LIB}"

./remove_empty_products.pl
./compute_missions.pl
./gen_top_tags_per_country.pl

# Generate the CSV and RDF exports
./export_database.pl
./mongodb_dump.sh /srv/off/html openfoodfacts 127.0.0.1 off

cd /srv/off/html/data
gzip < en.openfoodfacts.org.products.rdf > en.openfoodfacts.org.products.rdf.gz
gzip < fr.openfoodfacts.org.products.rdf > fr.openfoodfacts.org.products.rdf.gz
cd $OFF_PUBLIC_DATA_DIR
for export in en.$PRODUCT_OPENER_DOMAIN.products.csv fr.$PRODUCT_OPENER_DOMAIN.products.csv en.$PRODUCT_OPENER_DOMAIN.products.rdf fr.$PRODUCT_OPENER_DOMAIN.products.rdf; do
nice pigz < $export > new.$export.gz
mv -f new.$export.gz $export.gz
done

# Copy CSV and RDF files to AWS S3 using MinIO client
mc cp \
en.$PRODUCT_OPENER_DOMAIN.products.csv \
en.$PRODUCT_OPENER_DOMAIN.products.csv.gz \
en.$PRODUCT_OPENER_DOMAIN.products.rdf \
fr.$PRODUCT_OPENER_DOMAIN.products.csv \
fr.$PRODUCT_OPENER_DOMAIN.products.csv.gz \
fr.$PRODUCT_OPENER_DOMAIN.products.rdf \
s3/openfoodfacts-ds

# Generate the MongoDB dumps and jsonl export
cd $OFF_SCRIPTS_DIR

./mongodb_dump.sh $OFF_PUBLIC_DATA_DIR $PRODUCT_OPENER_FLAVOR $MONGODB_HOST $PRODUCT_OPENER_FLAVOR_SHORT

# Small products data and images export for Docker dev environments
# for about 1/10000th of the products contained in production.
./export_products_data_and_images.pl --sample-mod 10000,0 \
--products-file $OFF_PUBLIC_EXPORTS_DIR/products.random-modulo-10000.tar.gz \
--images-file $OFF_PUBLIC_EXPORTS_DIR/products.random-modulo-10000.images.tar.gz \
--jsonl-file $OFF_PUBLIC_EXPORTS_DIR/products.random-modulo-10000.jsonl.gz \
--mongo-file $OFF_PUBLIC_EXPORTS_DIR/products.random-modulo-10000.mongodbdump.gz
# On saturday, export modulo 1000 for larger sample
if [ "$(date +%u)" = "6" ]
then
./export_products_data_and_images.pl --sample-mod 1000,0 \
--products-file $OFF_PUBLIC_EXPORTS_DIR/products.random-modulo-1000.tar.gz \
--images-file $OFF_PUBLIC_EXPORTS_DIR/products.random-modulo-1000.images.tar.gz \
--jsonl-file $OFF_PUBLIC_EXPORTS_DIR/products.random-modulo-1000.jsonl.gz \
--mongo-file $OFF_PUBLIC_EXPORTS_DIR/products.random-modulo-1000.mongodbdump.gz
fi

# Generate small CSV dump for the offline mode of the mobile app
# parameters are passed through environment variables

# 2024/11/06: this script has been broken for a year in production, it will be reimplemented
# in the upcoming openfoodfacts-export service

# python3 $OFF_SCRIPTS_DIR/generate_dump_for_offline_apps.py
# cd $OFF_PUBLIC_DATA_DIR/offline
# zip new.en.$PRODUCT_OPENER_DOMAIN.products.small.csv.zip en.$PRODUCT_OPENER_DOMAIN.products.small.csv
# mv new.en.$PRODUCT_OPENER_DOMAIN.products.small.csv.zip en.$PRODUCT_OPENER_DOMAIN.products.small.csv.zip

# Exports for Carrefour
cd $OFF_SCRIPTS_DIR
./export_csv_file.pl --fields code,nutrition_grades_tags --query editors_tags=carrefour --separator ';' > $OFF_PUBLIC_DATA_DIR/exports/carrefour_nutriscore.csv

./export_csv_file.pl --fields code,nutrition_grades_tags --separator ';' > $OFF_PUBLIC_DATA_DIR/exports/nutriscore.csv

# On OFF and on Sunday, generates madenearme pages
if [ "$PRODUCT_OPENER_FLAVOR" == "off" ] && [ "$(date +%u)" = "7" ]
then
./generate_madenearme_pages.sh
fi


16 changes: 0 additions & 16 deletions scripts/gen_feeds_daily_obf.sh

This file was deleted.

12 changes: 0 additions & 12 deletions scripts/gen_feeds_daily_off-pro.sh

This file was deleted.

70 changes: 0 additions & 70 deletions scripts/gen_feeds_daily_off.sh

This file was deleted.
