Add comments for Web Discovery
DJAndries committed Jun 27, 2024
1 parent fbb2577 commit 3cfb538
Showing 21 changed files with 230 additions and 10 deletions.
15 changes: 15 additions & 0 deletions components/web_discovery/browser/content_scraper.h
@@ -33,6 +33,9 @@ struct PageScrapeResult {
static std::unique_ptr<PageScrapeResult> FromValue(const base::Value& dict);

GURL url;
// A map of DOM selectors to lists of scraped values, each wrapped in a
// Dict. Each Dict maps arbitrary keys (defined in the patterns) to
// scraped values.
base::flat_map<std::string, std::vector<base::Value::Dict>> fields;
std::string id;

@@ -41,6 +44,18 @@ struct PageScrapeResult {
std::optional<std::string> query;
};

// Extracts attribute values from the page DOM for reporting purposes.
// ContentScraper utilizes the following techniques:
//
// a) Extraction within the current page in the renderer (via `ScrapePage`).
//    The `mojom::DocumentExtractor` is used to request attribute values
//    from the current DOM in the view. Typically, this is used to extract a
//    search query, and decide whether the page is worthy of investigation
//    and reporting.
// b) Parsing and extracting HTML from a double fetch. This follows
//    the extraction in a). Used to extract all other needed details
//    from the page (e.g. search results). Uses a Rust library for DOM
//    operations, in accordance with the Rule of Two.
class ContentScraper {
public:
using PageScrapeResultCallback =
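
A minimal sketch of how a consumer might walk the `fields` map documented above; the "div.result" selector and "title" key are hypothetical pattern-defined names, and the surrounding wiring is assumed:

#include "base/logging.h"
#include "components/web_discovery/browser/content_scraper.h"

// Sketch only: assumes a PageScrapeResult produced by ContentScraper.
void InspectScrapeResult(const web_discovery::PageScrapeResult& result) {
  for (const auto& [selector, dicts] : result.fields) {
    // `selector` is a DOM selector from the patterns config
    // (e.g. "div.result"); `dicts` holds one Dict per matched element.
    for (const base::Value::Dict& dict : dicts) {
      // Each key (e.g. "title") is defined by a scrape rule.
      if (const std::string* title = dict.FindString("title")) {
        VLOG(1) << selector << ": " << *title;
      }
    }
  }
}
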
15 changes: 15 additions & 0 deletions components/web_discovery/browser/credential_manager.h
@@ -35,6 +35,16 @@ struct GenerateJoinRequestResult {
std::string signature;
};

// Manages and utilizes anonymous credentials used for communicating
// with Web Discovery servers. These Direct Anonymous Attestation credentials
// are used to prevent Sybil attacks on the servers.
// The manager provides two key functions:
//
// a) "joining": acquires credentials from the Web Discovery server. Join
// requests
// are signed with a random RSA key that is persisted with the profile.
// b) "signing": uses the previously acquired credentials to sign submissions
// which is required in order for the servers to accept the request.
class CredentialManager : public CredentialSigner {
public:
CredentialManager(PrefService* profile_prefs,
@@ -45,6 +55,8 @@ class CredentialManager : public CredentialSigner {
CredentialManager(const CredentialManager&) = delete;
CredentialManager& operator=(const CredentialManager&) = delete;

// Acquires credentials for all dates/"group public keys" published in
// the server config, if not stored already.
void JoinGroups();

// CredentialSigner:
@@ -54,6 +66,9 @@
std::vector<const uint8_t> basename,
SignCallback callback) override;

// Uses a fixed seed in the anonymous credential manager
// to provide deterministic credentials & signatures, which
// is useful for testing.
void UseFixedSeedForTesting();

private:
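
A small usage sketch, assuming a `CredentialManager` already constructed with profile prefs and a URL loader factory; the trigger function is hypothetical:

#include "components/web_discovery/browser/credential_manager.h"

// Sketch only: called once the server config (with its published
// date/"group public key" list) is available.
void OnServerConfigReady(web_discovery::CredentialManager* manager) {
  // Fetches any credentials not already stored; later Sign() calls
  // depend on the joins for the relevant dates having completed.
  manager->JoinGroups();
}
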
9 changes: 9 additions & 0 deletions components/web_discovery/browser/credential_signer.h
@@ -19,7 +19,16 @@ class CredentialSigner {
base::OnceCallback<void(std::optional<std::vector<const uint8_t>>)>;
virtual ~CredentialSigner();

// Returns true if a credential is available for the current date.
// The caller can expect future calls to `Sign` to succeed, if made today.
virtual bool CredentialExistsForToday() = 0;

// Signs a message for a given basename. The server has the ability
// to check whether two messages with the same basename were signed
// with the same credential without revealing the credential used,
// preventing Sybil attacks.
// See signature_basename.h/cc for more information on how the basename
// should be generated.
virtual bool Sign(std::vector<const uint8_t> msg,
std::vector<const uint8_t> basename,
SignCallback callback) = 0;
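
A hedged sketch of the signing flow; the message and basename values are placeholders, and the real basename derivation lives in signature_basename.h/cc:

#include "base/functional/bind.h"
#include "components/web_discovery/browser/credential_signer.h"

// Sketch only: signs `msg` if a credential for today is available.
void SignSubmission(web_discovery::CredentialSigner* signer,
                    std::vector<const uint8_t> msg,
                    std::vector<const uint8_t> basename) {
  if (!signer->CredentialExistsForToday()) {
    return;  // Joining hasn't finished for today; Sign would fail.
  }
  signer->Sign(
      std::move(msg), std::move(basename),
      base::BindOnce([](std::optional<std::vector<const uint8_t>> signature) {
        // Two messages signed with the same basename are linkable by the
        // server, so basenames must be unique per action/date.
        if (signature) {
          // Attach *signature to the outgoing submission.
        }
      }));
}
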
12 changes: 12 additions & 0 deletions components/web_discovery/browser/document_extractor/rs/src/lib.rs
@@ -17,27 +17,39 @@ use kuchikiki::{
#[cxx::bridge(namespace = "rust_document_extractor")]
mod ffi {
pub struct SelectAttributeRequest {
/// An optional selector for an element within the current selected element.
/// The attribute will be retrieved from the embedded element.
/// If not needed, an empty string should be provided.
pub sub_selector: String,
/// Arbitrary ID used for storing the scraped result.
pub key: String,
/// Name of the attribute to scrape.
pub attribute: String,
}

pub struct SelectRequest {
/// The DOM selector for the element to scrape.
pub root_selector: String,
/// Scrape requests for the selected element.
pub attribute_requests: Vec<SelectAttributeRequest>,
}

pub struct AttributePair {
/// Arbitrary ID for the scraped result.
pub key: String,
/// The scraped value. Will be empty if attribute is not available.
pub value: String,
}

pub struct AttributeResult {
/// The DOM selector for the scraped element.
pub root_selector: String,
/// A list of arbitrary IDs and scraped value pairs.
pub attribute_pairs: Vec<AttributePair>,
}

extern "Rust" {
/// Extracts DOM attributes from the result of a double fetch.
fn query_element_attributes(
html: &CxxString,
requests: &CxxVector<SelectRequest>,
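
A sketch of how the C++ side might build a request for this bridge. The struct shapes come from the bridge definition above, but the generated header path, the `textContent` attribute, and the selectors are assumptions:

#include "components/web_discovery/browser/document_extractor/rs/src/lib.rs.h"

// Sketch only: one request that scrapes the title link of each result.
rust_document_extractor::SelectRequest MakeResultTitleRequest() {
  rust_document_extractor::SelectAttributeRequest attribute_request;
  attribute_request.sub_selector = "a.title";   // element within the root
  attribute_request.key = "title";              // arbitrary result ID
  attribute_request.attribute = "textContent";  // attribute to scrape
  rust_document_extractor::SelectRequest request;
  request.root_selector = "div.result";         // element to select
  request.attribute_requests.push_back(std::move(attribute_request));
  return request;
}
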
8 changes: 8 additions & 0 deletions components/web_discovery/browser/double_fetcher.h
@@ -24,6 +24,11 @@ class SimpleURLLoader;

namespace web_discovery {

// Makes anonymous requests to relevant page URLs, without involvement of the
// user's session. In the case of search engine result pages, the result of
// the double fetch will be scraped for search results for a future
// submission. Uses `RequestQueue` to persist and schedule double fetches.
// Requests will be sent at somewhat random intervals averaging a minute.
class DoubleFetcher {
public:
using FetchedCallback =
@@ -38,6 +43,9 @@ class DoubleFetcher {
DoubleFetcher(const DoubleFetcher&) = delete;
DoubleFetcher& operator=(const DoubleFetcher&) = delete;

// Queues a double fetch for a given URL. The associated data will be stored
// alongside the queued request, and will be passed to the `FetchedCallback`
// upon completion.
void ScheduleDoubleFetch(const GURL& url, base::Value associated_data);

private:
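
A usage sketch, assuming a constructed `DoubleFetcher`; the helper and the "query" key in the associated data are hypothetical:

#include "components/web_discovery/browser/double_fetcher.h"

// Sketch only: queue an anonymous re-fetch of a search results page.
void QueueSearchPageFetch(web_discovery::DoubleFetcher* fetcher,
                          const GURL& url,
                          const std::string& query) {
  base::Value::Dict associated_data;
  associated_data.Set("query", query);
  // The data is persisted beside the queued request and handed back to
  // the FetchedCallback once the fetch completes.
  fetcher->ScheduleDoubleFetch(url, base::Value(std::move(associated_data)));
}
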
3 changes: 3 additions & 0 deletions components/web_discovery/browser/hash_detection.h
@@ -12,6 +12,9 @@

namespace web_discovery {

// Uses a pre-trained Markov chain classifier to detect the likelihood
// of a hash in a given piece of text. Used in privacy guard functions
// for detecting potentially private URLs/queries.
bool IsHashLikely(RegexUtil& regex_util,
std::string value,
double probability_multiplier = 1.0);
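
A sketch of how the classifier might gate reporting; `LooksReportable` is a hypothetical helper, not part of the component:

#include "components/web_discovery/browser/hash_detection.h"

// Sketch only: a query that classifies as hash-like is treated as
// potentially private (e.g. a session token) and dropped.
bool LooksReportable(web_discovery::RegexUtil& regex_util,
                     const std::string& query) {
  return !web_discovery::IsHashLikely(regex_util, query);
}
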
75 changes: 72 additions & 3 deletions components/web_discovery/browser/patterns.h
@@ -21,83 +21,152 @@ class RE2;

namespace web_discovery {

enum class ScrapeRuleType { kStandard, kSearchQuery, kWidgetTitle, kOther };
enum class PayloadRuleType { kQuery, kSingle };
enum class PayloadResultType { kSingle, kClustered, kCustom };
enum class ScrapeRuleType {
// Will retrieve a value not defined in the DOM, such as the client country
// code or the current URL.
kStandard,
// If the following two types are used for a rule, the value will be marked
// as the search query, which will be used for privacy checks.
kSearchQuery,
kWidgetTitle,
// All other rules should have this type. No special processing will be
// performed.
kOther
};
enum class PayloadRuleType {
// Coupled with the `kClustered` result type.
// All instances of a given attribute will be grouped into a single payload.
kQuery,
// Coupled with the `kSingle` result type.
// Each instance of a given attribute will have its own payload.
kSingle
};
enum class PayloadResultType {
// Coupled with the `kSingle` rule type.
kSingle,
// Coupled with the `kClustered` rule type.
kClustered,
// Currently unsupported/ignored.
kCustom
};

// Contains functions for refining the scraped value. The inner vector
// contains the function name and arguments for the function.
using RefineFunctionList = std::vector<std::vector<base::Value>>;

// Defines a rule for scraping an attribute from a given selected element.
struct ScrapeRule {
ScrapeRule();
~ScrapeRule();

ScrapeRule(const ScrapeRule&) = delete;
ScrapeRule& operator=(const ScrapeRule&) = delete;

// An optional selector for an element within the current selected element.
// The attribute will be retrieved from the embedded element.
std::optional<std::string> sub_selector;
ScrapeRuleType rule_type;
// The name of the attribute to retrieve for a DOM element.
std::string attribute;
// Functions used to refine the retrieved value. See the "func ids" defined
// in content_scraper.cc for all possible functions.
RefineFunctionList functions_applied;
};

// A map of keys (arbitrary IDs used for storing the scraped result) to scrape
// rules.
using ScrapeRuleGroup =
base::flat_map<std::string, std::unique_ptr<ScrapeRule>>;

// A rule for providing a single key/value pair within the submission payload.
struct PayloadRule {
PayloadRule();
~PayloadRule();

PayloadRule(const PayloadRule&) = delete;
PayloadRule& operator=(const PayloadRule&) = delete;

// The DOM selector of the scraped attribute.
std::string selector;
// The arbitrary key associated with the scraped value.
std::string key;
// If set to true, an array-like Dict (each dict key is an index)
// will be rendered.
// Each value in the Dict will be a Dict containing all keys/values
// associated with the selector. This is commonly used for listing search
// results.
bool is_join = false;
};

// Contains rules for generating a payload for submission.
struct PayloadRuleGroup {
PayloadRuleGroup();
~PayloadRuleGroup();

PayloadRuleGroup(const PayloadRuleGroup&) = delete;
PayloadRuleGroup& operator=(const PayloadRuleGroup&) = delete;

// An arbitrary ID for the rule group. Currently, this isn't used in the
// payload.
std::string key;
PayloadRuleType rule_type;
PayloadResultType result_type;
// The name of the "action" for the given payload.
std::string action;
// The rules for generating the fields within the payload.
std::vector<PayloadRule> rules;
};

// Contains settings and rule groups associated with a particular URL.
struct PatternsURLDetails {
PatternsURLDetails();
~PatternsURLDetails();

PatternsURLDetails(const PatternsURLDetails&) = delete;
PatternsURLDetails& operator=(const PatternsURLDetails&) = delete;

// The regex used to match the URL in the address bar.
std::unique_ptr<re2::RE2> url_regex;
bool is_search_engine;
// The arbitrary two- or three-letter ID associated with the site.
std::string id;
// The search path prefix used for constructing private search queries
// for double fetching.
std::optional<std::string> search_template_prefix;
// The scraping rules for the site. A map of DOM selectors
// to rule groups.
base::flat_map<std::string, ScrapeRuleGroup> scrape_rule_groups;
// The payload generation rules used for generating submissions
// from scraped attributes.
std::vector<PayloadRuleGroup> payload_rule_groups;
};

// The full "patterns" configuration provided by the Web Discovery server.
// The configuration provides rules for scraping certain pages.
struct PatternsGroup {
PatternsGroup();
~PatternsGroup();

PatternsGroup(const PatternsGroup&) = delete;
PatternsGroup& operator=(const PatternsGroup&) = delete;

// Checks a URL against all URL regexes in either the "normal" or "strict"
// set, and returns the URL details/rules if available.
const PatternsURLDetails* GetMatchingURLPattern(const GURL& url,
bool is_strict_scrape) const;

// A list of URLs and rules used for scraping pages in the renderer,
// pre-"double fetch". These rules typically scrape simple attributes which
// are used to determine whether a page is private (i.e. the search query).
std::vector<PatternsURLDetails> normal_patterns;
// A list of URLs and rules used for scraping contents from a "double fetch".
// The rules are usually more involved than the "normal" rules. In the case of
// search engine result pages, the rules will be used to retrieve the
// search results and any other relevant details.
std::vector<PatternsURLDetails> strict_patterns;
};

// Returns nullptr if parsing fails.
std::unique_ptr<PatternsGroup> ParsePatterns(const std::string& patterns_json);

} // namespace web_discovery
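
A sketch of consuming the parsed configuration; the JSON source and the control flow around the match are assumptions:

#include "components/web_discovery/browser/patterns.h"

// Sketch only: parse a server-provided config and match a page URL.
void MatchURL(const std::string& patterns_json, const GURL& url) {
  std::unique_ptr<web_discovery::PatternsGroup> patterns =
      web_discovery::ParsePatterns(patterns_json);
  if (!patterns) {
    return;  // Parsing failed; ParsePatterns returns nullptr.
  }
  // Pre-"double fetch" match against the renderer-side "normal" rules.
  const web_discovery::PatternsURLDetails* details =
      patterns->GetMatchingURLPattern(url, /*is_strict_scrape=*/false);
  if (details && details->is_search_engine) {
    // details->scrape_rule_groups drives the renderer-side extraction.
  }
}
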
4 changes: 4 additions & 0 deletions components/web_discovery/browser/payload_generator.h
@@ -20,12 +20,16 @@ namespace web_discovery {
inline constexpr char kActionKey[] = "action";
inline constexpr char kInnerPayloadKey[] = "payload";

// Generates "query" messages using the payload generation rules
// and scraped data for a given site.
std::vector<base::Value::Dict> GenerateQueryPayloads(
const ServerConfig& server_config,
RegexUtil& regex_util,
const PatternsURLDetails* url_details,
std::unique_ptr<PageScrapeResult> scrape_result);

// Generates an "alive" message to indicate an opted-in
// status to the server.
base::Value::Dict GenerateAlivePayload(const ServerConfig& server_config,
std::string date_hour);

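
A sketch of generating submissions from a strict scrape; all inputs are assumed to come from the surrounding Web Discovery machinery:

#include "components/web_discovery/browser/payload_generator.h"

// Sketch only: build "query" messages ready for the Reporter.
void BuildQueryPayloads(
    const web_discovery::ServerConfig& server_config,
    web_discovery::RegexUtil& regex_util,
    const web_discovery::PatternsURLDetails* url_details,
    std::unique_ptr<web_discovery::PageScrapeResult> scrape_result) {
  std::vector<base::Value::Dict> payloads =
      web_discovery::GenerateQueryPayloads(
          server_config, regex_util, url_details, std::move(scrape_result));
  // Each Dict carries an "action" key and an inner "payload" Dict
  // (kActionKey/kInnerPayloadKey above), ready for Reporter::ScheduleSend.
}
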
11 changes: 11 additions & 0 deletions components/web_discovery/browser/privacy_guard.h
@@ -14,18 +14,29 @@

namespace web_discovery {

// Checks if a URL is likely to be private based on various criteria.
// If true, the page should not be investigated or reported.
bool IsPrivateURLLikely(RegexUtil& regex_util,
const GURL& url,
const PatternsURLDetails* matching_url_details);

// Determines if a search query is likely to contain private information.
// If true, the search query should not be investigated or reported.
bool IsPrivateQueryLikely(RegexUtil& regex_util, const std::string& query);

// Generates a simple search URL (without additional query parameters)
// based on the original search URL and query. Used for the double fetch
// to ensure that the user's profile is not involved in the query.
GURL GeneratePrivateSearchURL(const GURL& original_url,
const std::string& query,
const PatternsURLDetails& matching_url_details);

// Checks if a URL should be dropped due to its length or content.
// Currently only used for determining whether to mask a URL
// in the function below.
bool ShouldDropLongURL(RegexUtil& regex_util, const GURL& url);

// Masks a URL to protect privacy. Returns nullopt if URL is invalid.
std::optional<std::string> MaskURL(RegexUtil& regex_util, const GURL& url);

} // namespace web_discovery
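
A sketch combining the guards above into a pre-fetch check; the ordering is illustrative rather than the component's actual control flow:

#include <optional>

#include "components/web_discovery/browser/privacy_guard.h"

// Sketch only: returns a sanitized URL for the double fetch, or nullopt
// if the page/query looks private.
std::optional<GURL> PrepareDoubleFetchURL(
    web_discovery::RegexUtil& regex_util,
    const GURL& url,
    const std::string& query,
    const web_discovery::PatternsURLDetails& details) {
  if (web_discovery::IsPrivateURLLikely(regex_util, url, &details) ||
      web_discovery::IsPrivateQueryLikely(regex_util, query)) {
    return std::nullopt;  // Likely private: don't investigate or report.
  }
  // Rebuild a bare search URL so session-specific query parameters
  // don't leak into the anonymous double fetch.
  return web_discovery::GeneratePrivateSearchURL(url, query, details);
}
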
2 changes: 2 additions & 0 deletions components/web_discovery/browser/regex_util.h
@@ -16,6 +16,8 @@

namespace web_discovery {

// Lazily creates and caches pre-compiled regexes, mainly used for
// privacy risk assessment of page URLs/contents.
class RegexUtil {
public:
RegexUtil();
10 changes: 10 additions & 0 deletions components/web_discovery/browser/reporter.h
@@ -29,6 +29,15 @@ class SimpleURLLoader;

namespace web_discovery {

// Handles all functions required for reporting generated payloads:
// - zlib compression
// - ECDH key derivation + key exchange
// - AES encryption (to prevent eavesdropping by the server proxy)
// - signing the request using anonymous credentials from the
// `CredentialManager` (to prevent Sybil attacks on the server)
// - performing the request for submission
// Uses `RequestQueue` to persist and schedule submissions. Reports
// will be processed at somewhat random intervals averaging a minute.
class Reporter {
public:
Reporter(PrefService* profile_prefs,
@@ -41,6 +50,7 @@ class Reporter {
Reporter(const Reporter&) = delete;
Reporter& operator=(const Reporter&) = delete;

// Schedules a generated payload for submission.
void ScheduleSend(base::Value::Dict payload);

private:
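
A sketch of handing generated payloads to the reporter; the batch helper is hypothetical:

#include "components/web_discovery/browser/reporter.h"

// Sketch only: each payload is persisted via RequestQueue, then
// compressed, encrypted, signed and sent at a randomized interval.
void SubmitPayloads(web_discovery::Reporter* reporter,
                    std::vector<base::Value::Dict> payloads) {
  for (auto& payload : payloads) {
    reporter->ScheduleSend(std::move(payload));
  }
}
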