From 3cfb538fea7ce8997ffab4b36aa8bb0a10b2871c Mon Sep 17 00:00:00 2001 From: Darnell Andries Date: Thu, 27 Jun 2024 14:19:38 -0700 Subject: [PATCH] Add comments for Web Discovery --- .../web_discovery/browser/content_scraper.h | 15 ++++ .../browser/credential_manager.h | 15 ++++ .../web_discovery/browser/credential_signer.h | 9 +++ .../browser/document_extractor/rs/src/lib.rs | 12 +++ .../web_discovery/browser/double_fetcher.h | 8 ++ .../web_discovery/browser/hash_detection.h | 3 + components/web_discovery/browser/patterns.h | 75 ++++++++++++++++++- .../web_discovery/browser/payload_generator.h | 4 + .../web_discovery/browser/privacy_guard.h | 11 +++ components/web_discovery/browser/regex_util.h | 2 + components/web_discovery/browser/reporter.h | 10 +++ .../web_discovery/browser/request_queue.h | 6 ++ .../browser/server_config_loader.h | 13 ++++ .../browser/signature_basename.h | 13 +++- components/web_discovery/browser/util.h | 13 ++++ .../web_discovery/browser/wdp_service.cc | 5 +- .../web_discovery/browser/wdp_service.h | 10 ++- .../browser/web_discovery_tab_helper.cc | 2 +- components/web_discovery/common/features.h | 2 + .../web_discovery/common/web_discovery.mojom | 10 +++ .../renderer/blink_document_extractor.h | 2 + 21 files changed, 230 insertions(+), 10 deletions(-) diff --git a/components/web_discovery/browser/content_scraper.h b/components/web_discovery/browser/content_scraper.h index 980db82b15b4..68d6cb60178a 100644 --- a/components/web_discovery/browser/content_scraper.h +++ b/components/web_discovery/browser/content_scraper.h @@ -33,6 +33,9 @@ struct PageScrapeResult { static std::unique_ptr FromValue(const base::Value& dict); GURL url; + // A map of DOM selectors to list of scraped values embedded in a Dict. + // Each dict contains arbitrary keys (defined in the patterns) to scraped + // values. 
base::flat_map> fields; std::string id; @@ -41,6 +44,18 @@ struct PageScrapeResult { std::optional query; }; +// Extracts attribute values from the page DOM for reporting purposes. +// ContentScraper utilizes the following techniques: +// +// a) Extraction within the current page in the renderer (via `ScrapePage`). +// The `mojom::DocumentExtractor` is used to request attribute values +// from the current DOM in the view. Typically, this is used to extract a +// search query, and decide whether the page is worthy of investigation +// and reporting. +// b) Parsing and extracting HTML from a double fetch. This follows +// the extraction in a). Used to extract all other needed details +// from the page i.e. search results. Uses a Rust library for DOM +// operations, in respect of Rule of Two. class ContentScraper { public: using PageScrapeResultCallback = diff --git a/components/web_discovery/browser/credential_manager.h b/components/web_discovery/browser/credential_manager.h index e32333e58f8b..4eb7418484fd 100644 --- a/components/web_discovery/browser/credential_manager.h +++ b/components/web_discovery/browser/credential_manager.h @@ -35,6 +35,16 @@ struct GenerateJoinRequestResult { std::string signature; }; +// Manages and utilizes anonymous credentials used for communicating +// with Web Discovery servers. These Direct Anonymous Attestation credentials +// are used to prevent Sybil attacks on the servers. +// The manager provides two key functions: +// +// a) "joining": acquires credentials from the Web Discovery server. Join +// requests +// are signed with a random RSA key that is persisted with the profile. +// b) "signing": uses the previously acquired credentials to sign submissions +// which is required in order for the servers to accept the request. 
class CredentialManager : public CredentialSigner { public: CredentialManager(PrefService* profile_prefs, @@ -45,6 +55,8 @@ class CredentialManager : public CredentialSigner { CredentialManager(const CredentialManager&) = delete; CredentialManager& operator=(const CredentialManager&) = delete; + // Acquires credentials for all dates/"group public keys" published in + // the server config, if not stored already. void JoinGroups(); // CredentialSigner: @@ -54,6 +66,9 @@ class CredentialManager : public CredentialSigner { std::vector basename, SignCallback callback) override; + // Uses a fixed seed in the anonymous credential manager + // to provide deterministic credentials & signatures which + // are useful for testing. void UseFixedSeedForTesting(); private: diff --git a/components/web_discovery/browser/credential_signer.h b/components/web_discovery/browser/credential_signer.h index ef80cbb8ddc2..b7e167cf1e49 100644 --- a/components/web_discovery/browser/credential_signer.h +++ b/components/web_discovery/browser/credential_signer.h @@ -19,7 +19,16 @@ class CredentialSigner { base::OnceCallback>)>; virtual ~CredentialSigner(); + // Returns true if a credential is available for the current date. + // The caller can expect future calls to `Sign` to succeed, if made today. virtual bool CredentialExistsForToday() = 0; + + // Signs a message for a given basename. The server has the ability + // to check whether two messages with the same basename were signed + // with the same credential without revealing the credential used, + // preventing Sybil attacks. + // See signature_basename.h/cc for more information on how the basename + // should be generated. 
virtual bool Sign(std::vector msg, std::vector basename, SignCallback callback) = 0; diff --git a/components/web_discovery/browser/document_extractor/rs/src/lib.rs b/components/web_discovery/browser/document_extractor/rs/src/lib.rs index ac52e8940731..17ce268b19e2 100644 --- a/components/web_discovery/browser/document_extractor/rs/src/lib.rs +++ b/components/web_discovery/browser/document_extractor/rs/src/lib.rs @@ -17,27 +17,39 @@ use kuchikiki::{ #[cxx::bridge(namespace = "rust_document_extractor")] mod ffi { pub struct SelectAttributeRequest { + /// An optional selector for an element within the current selected element. + /// The attribute will be retrieved from the embedded element. + /// If not needed, an empty string should be provided. pub sub_selector: String, + /// Arbitrary ID used for storing the scraped result. pub key: String, + /// Name of the attribute to scrape. pub attribute: String, } pub struct SelectRequest { + /// The DOM selector for the element to scrape. pub root_selector: String, + /// Scrape requests for the selected element. pub attribute_requests: Vec, } pub struct AttributePair { + /// Arbitrary ID for the scraped result. pub key: String, + /// The scraped value. Will be empty if attribute is not available. pub value: String, } pub struct AttributeResult { + /// The DOM selector for the scraped element. pub root_selector: String, + /// A list of arbitrary IDs and scraped value pairs. pub attribute_pairs: Vec, } extern "Rust" { + /// Extracts DOM attributes from the result of a double fetch. 
fn query_element_attributes( html: &CxxString, requests: &CxxVector, diff --git a/components/web_discovery/browser/double_fetcher.h b/components/web_discovery/browser/double_fetcher.h index 08f5e05bd480..e2b8c979a78d 100644 --- a/components/web_discovery/browser/double_fetcher.h +++ b/components/web_discovery/browser/double_fetcher.h @@ -24,6 +24,11 @@ class SimpleURLLoader; namespace web_discovery { +// Makes anonymous requests to relevant page URLs, without involvement of the +// user's session. In the case of search engine result pages, the result of the +// double fetch will be scraped for search engine results for a future submission. +// Uses `RequestQueue` to persist and schedule double fetches. Requests +// will be sent on somewhat random intervals averaging to a minute. class DoubleFetcher { public: using FetchedCallback = @@ -38,6 +43,9 @@ class DoubleFetcher { DoubleFetcher(const DoubleFetcher&) = delete; DoubleFetcher& operator=(const DoubleFetcher&) = delete; + // Queues a double fetch for a given URL. The associated data will be stored + // beside the queue request, and will be passed to the `FetchedCallback` + // upon completion. void ScheduleDoubleFetch(const GURL& url, base::Value associated_data); private: diff --git a/components/web_discovery/browser/hash_detection.h b/components/web_discovery/browser/hash_detection.h index 376674b4393e..c6461aa40bc4 100644 --- a/components/web_discovery/browser/hash_detection.h +++ b/components/web_discovery/browser/hash_detection.h @@ -12,6 +12,9 @@ namespace web_discovery { +// Uses a pre-trained Markov chain classifier to detect the likelihood +// of a hash in a given piece of text. Used in privacy guard functions +// for detecting potentially private URLs/queries. 
bool IsHashLikely(RegexUtil& regex_util, std::string value, double probability_multiplier = 1.0); diff --git a/components/web_discovery/browser/patterns.h b/components/web_discovery/browser/patterns.h index 6416089eddec..b9e3b749e104 100644 --- a/components/web_discovery/browser/patterns.h +++ b/components/web_discovery/browser/patterns.h @@ -21,12 +21,40 @@ class RE2; namespace web_discovery { -enum class ScrapeRuleType { kStandard, kSearchQuery, kWidgetTitle, kOther }; -enum class PayloadRuleType { kQuery, kSingle }; -enum class PayloadResultType { kSingle, kClustered, kCustom }; +enum class ScrapeRuleType { + // Will retrieve a value not defined in the DOM, such as the client country + // code or the current url. + kStandard, + // If the following two types are used for a rule, the value will be marked + // as the search query, which will be used for privacy checks. + kSearchQuery, + kWidgetTitle, + // All other rules should have this type. No special processing will be + // performed. + kOther }; +enum class PayloadRuleType { + // Coupled with the `kClustered` result type. + // All instances of a given attribute will be grouped into a single payload. + kQuery, + // Coupled with the `kSingle` result type. + // Each instance of a given attribute will have its own payload. + kSingle }; +enum class PayloadResultType { + // Coupled with the `kSingle` rule type. + kSingle, + // Coupled with the `kClustered` rule type. + kClustered, + // Currently unsupported/ignored. + kCustom }; +// Contains functions for refining the scraped value. The inner vector +// contains the function name and arguments for the function. using RefineFunctionList = std::vector>; +// Defines a rule for scraping an attribute from a given selected element. 
struct ScrapeRule { ScrapeRule(); ~ScrapeRule(); @@ -34,15 +62,23 @@ struct ScrapeRule { ScrapeRule(const ScrapeRule&) = delete; ScrapeRule& operator=(const ScrapeRule&) = delete; + // An optional selector for an element within the current selected element. + // The attribute will be retrieved from the embedded element. std::optional sub_selector; ScrapeRuleType rule_type; + // The name of the attribute to retrieve for a DOM element. std::string attribute; + // Functions used to refine the retrieved value. See the "func ids" defined + // in content_scraper.cc for all possible functions. RefineFunctionList functions_applied; }; +// A map of keys (arbitrary IDs used for storing the scraped result) to scrape +// rules. using ScrapeRuleGroup = base::flat_map>; +// A rule for providing a single key/value pair within the submission payload. struct PayloadRule { PayloadRule(); ~PayloadRule(); @@ -50,11 +86,19 @@ struct PayloadRule { PayloadRule(const PayloadRule&) = delete; PayloadRule& operator=(const PayloadRule&) = delete; + // The DOM selector of the scraped attribute. std::string selector; + // The arbitrary key associated with the scraped value. std::string key; + // If set to true, an array-like Dict (each dict key is an index) + // will be rendered. + // Each value in the Dict will be a Dict containing all keys/values + // associated with the selector. This is commonly used for listing search + // results. bool is_join = false; }; +// Contains rules for generating a payload for submission. struct PayloadRuleGroup { PayloadRuleGroup(); ~PayloadRuleGroup(); @@ -62,13 +106,18 @@ struct PayloadRuleGroup { PayloadRuleGroup(const PayloadRuleGroup&) = delete; PayloadRuleGroup& operator=(const PayloadRuleGroup&) = delete; + // An arbitrary ID for the rule group. Currently, this isn't used in the + // payload. std::string key; PayloadRuleType rule_type; PayloadResultType result_type; + // The name of the "action" for the given payload. 
std::string action; + // The rules for generating the fields within the payload. std::vector rules; }; +// Contains settings and rule groups associated with a particular URL. struct PatternsURLDetails { PatternsURLDetails(); ~PatternsURLDetails(); @@ -76,14 +125,24 @@ struct PatternsURLDetails { PatternsURLDetails(const PatternsURLDetails&) = delete; PatternsURLDetails& operator=(const PatternsURLDetails&) = delete; + // The regex used to match the URL in the address bar. std::unique_ptr url_regex; bool is_search_engine; + // The two or three-letter arbitrary id associated with the site. std::string id; + // The search path prefix used for constructing private search queries + // for double fetching. std::optional search_template_prefix; + // The scraping rules for the site. A map of DOM selectors + // to rule groups. base::flat_map scrape_rule_groups; + // The payload generation rules used for generating submissions + // from scraped attributes. std::vector payload_rule_groups; }; +// The full "patterns" configuration provided by the Web Discovery server. +// The configuration provides rules for scraping certain pages. struct PatternsGroup { PatternsGroup(); ~PatternsGroup(); @@ -91,13 +150,23 @@ struct PatternsGroup { PatternsGroup(const PatternsGroup&) = delete; PatternsGroup& operator=(const PatternsGroup&) = delete; + // Checks URL against all URL regexes in either the "normal" or "strict" set, + // and returns the URL details/rules if available. const PatternsURLDetails* GetMatchingURLPattern(const GURL& url, bool is_strict_scrape) const; + // A list of URLs and rules used for scraping pages in the renderer, + // pre-"double fetch". These rules typically scrape simple attributes which + // are used to determine whether a page is private (i.e. the search query). std::vector normal_patterns; + // A list of URLs and rules used for scraping contents from a "double fetch". + // The rules are usually more involved than the "normal" rules. 
In the case of + // search engine result pages, the rules will be used to retrieve the + // search results and any other relevant details. std::vector strict_patterns; }; +// Returns nullptr if parsing fails. std::unique_ptr ParsePatterns(const std::string& patterns_json); } // namespace web_discovery diff --git a/components/web_discovery/browser/payload_generator.h b/components/web_discovery/browser/payload_generator.h index 732eff6234fa..b234c5872abc 100644 --- a/components/web_discovery/browser/payload_generator.h +++ b/components/web_discovery/browser/payload_generator.h @@ -20,12 +20,16 @@ namespace web_discovery { inline constexpr char kActionKey[] = "action"; inline constexpr char kInnerPayloadKey[] = "payload"; +// Generates "query" messages using the payload generation rules +// and scraped data for a given site. std::vector GenerateQueryPayloads( const ServerConfig& server_config, RegexUtil& regex_util, const PatternsURLDetails* url_details, std::unique_ptr scrape_result); +// Generates an "alive" message to indicate an opted-in +// status to the server. base::Value::Dict GenerateAlivePayload(const ServerConfig& server_config, std::string date_hour); diff --git a/components/web_discovery/browser/privacy_guard.h b/components/web_discovery/browser/privacy_guard.h index 38328f4068e2..36c2fcc971b4 100644 --- a/components/web_discovery/browser/privacy_guard.h +++ b/components/web_discovery/browser/privacy_guard.h @@ -14,18 +14,29 @@ namespace web_discovery { +// Checks if a URL is likely to be private based on various criteria. +// If true, the page should not be investigated or reported. bool IsPrivateURLLikely(RegexUtil& regex_util, const GURL& url, const PatternsURLDetails* matching_url_details); +// Determines if a search query is likely to contain private information. +// If true, the search query should not be investigated or reported. 
bool IsPrivateQueryLikely(RegexUtil& regex_util, const std::string& query); +// Generates a simple search URL (without additional query parameters) +// based on the original search URL and query. Used for the double fetch +// to ensure that the user's profile is not involved in the query. GURL GeneratePrivateSearchURL(const GURL& original_url, const std::string& query, const PatternsURLDetails& matching_url_details); +// Checks if a URL should be dropped due to its length or content. +// Currently only used for determining whether to mask a URL +// in the function below. bool ShouldDropLongURL(RegexUtil& regex_util, const GURL& url); +// Masks a URL to protect privacy. Returns nullopt if URL is invalid. std::optional MaskURL(RegexUtil& regex_util, const GURL& url); } // namespace web_discovery diff --git a/components/web_discovery/browser/regex_util.h b/components/web_discovery/browser/regex_util.h index 970030c55230..e08dad02fd6e 100644 --- a/components/web_discovery/browser/regex_util.h +++ b/components/web_discovery/browser/regex_util.h @@ -16,6 +16,8 @@ namespace web_discovery { +// Lazily creates and caches pre-compiled regexes, mainly used for +// privacy risk assessment of page URLs/contents. 
class RegexUtil { public: RegexUtil(); diff --git a/components/web_discovery/browser/reporter.h b/components/web_discovery/browser/reporter.h index a3649b049c72..f79e14fcdbc7 100644 --- a/components/web_discovery/browser/reporter.h +++ b/components/web_discovery/browser/reporter.h @@ -29,6 +29,15 @@ class SimpleURLLoader; namespace web_discovery { +// Handles all functions required for reporting generated payloads: +// - zlib compression +// - ECDH key derivation + key exchange +// - AES encryption (to prevent eavesdropping by the server proxy) +// - signing the request using anonymous credentials from the +// `CredentialManager` (to prevent Sybil attacks on the server) +// - performing the request for submission +// Uses `RequestQueue` to persist and schedule submissions. Reports +// will be processed on somewhat random intervals averaging to a minute. class Reporter { public: Reporter(PrefService* profile_prefs, @@ -41,6 +50,7 @@ class Reporter { Reporter(const Reporter&) = delete; Reporter& operator=(const Reporter&) = delete; + // Schedule a generated payload for submission. void ScheduleSend(base::Value::Dict payload); private: diff --git a/components/web_discovery/browser/request_queue.h b/components/web_discovery/browser/request_queue.h index 6fbfbd194b20..fd385bcee22a 100644 --- a/components/web_discovery/browser/request_queue.h +++ b/components/web_discovery/browser/request_queue.h @@ -16,6 +16,10 @@ class PrefService; namespace web_discovery { +// Persists and schedules requests on randomized intervals within +// an interval range. If request failures exceed the threshold defined in +// `max_retries`, the request will be dropped from the list. If a persisted +// request age exceeds `request_max_age`, the request will be dropped. class RequestQueue { public: RequestQueue( @@ -31,6 +35,8 @@ class RequestQueue { RequestQueue(const RequestQueue&) = delete; RequestQueue& operator=(const RequestQueue&) = delete; + // Persist and schedule a request. 
The arbitrary data will be passed + // to `start_request_callback` on the scheduled interval. void ScheduleRequest(base::Value request_data); // Returns data value if request is deleted from queue, due to the retry limit // or success diff --git a/components/web_discovery/browser/server_config_loader.h b/components/web_discovery/browser/server_config_loader.h index 797227ca7256..856073249b34 100644 --- a/components/web_discovery/browser/server_config_loader.h +++ b/components/web_discovery/browser/server_config_loader.h @@ -58,6 +58,12 @@ struct ServerConfig { std::string location; }; +// Handles retrieval, updating and caching of the following server +// configurations: +// - HPN server config: contains public keys, and "source maps" used +// for generating basenames. +// - "quorum" config: contains the country code of the user +// - patterns: contains the rules for scraping/submission of certain pages class ServerConfigLoader { public: ServerConfigLoader(PrefService* local_state, @@ -70,9 +76,16 @@ class ServerConfigLoader { ServerConfigLoader(const ServerConfigLoader&) = delete; ServerConfigLoader& operator=(const ServerConfigLoader&) = delete; + // Loads all three server configurations. Update requests will be scheduled + // once complete. void LoadConfigs(); + // Returns the last loaded server config, which is a combination of the + // HPN and "quorum" configs. May only call after the config_callback is + // triggered. const ServerConfig& GetLastServerConfig() const; + // Returns the pattern config. May only call after the patterns_callback is + // triggered. 
const PatternsGroup& GetLastPatterns() const; void SetLastServerConfigForTesting( diff --git a/components/web_discovery/browser/signature_basename.h b/components/web_discovery/browser/signature_basename.h index fb6101608649..1a07cf027e25 100644 --- a/components/web_discovery/browser/signature_basename.h +++ b/components/web_discovery/browser/signature_basename.h @@ -33,16 +33,27 @@ struct BasenameResult { uint32_t count_tag_hash; }; +// Generates a basename used for the signature. The basename is a sha hash +// of the message "action" (i.e. "query"), the settings for that action +// (defined in the server's "source map"), cherry-picked attributes from the +// payload and the count index for the given message. The count will be under +// the limit defined for the action; the function will return nullopt if the +// limit for the action is exceeded. std::optional GenerateBasename( PrefService* profile_prefs, const ServerConfig& server_config, RegexUtil& regex_util, const base::Value::Dict& payload); +// Saves the count returned from `GenerateBasename` in the prefs. +// This ensures that the count index cannot be used for future messages +// within the defined action limit period (default is 24 hours). +// This should be called after a submission is successfully sent to +// the server. 
void SaveBasenameCount(PrefService* profile_prefs, uint32_t count_tag_hash, size_t count); } // namespace web_discovery -#endif // BRAVE_COMPONENTS_WEB_DISCOVERY_BROWSER_PAYLOAD_GENERATOR_H_ +#endif // BRAVE_COMPONENTS_WEB_DISCOVERY_BROWSER_SIGNATURE_BASENAME_H_ diff --git a/components/web_discovery/browser/util.h b/components/web_discovery/browser/util.h index 4a00b6c17262..9792b4622d55 100644 --- a/components/web_discovery/browser/util.h +++ b/components/web_discovery/browser/util.h @@ -22,6 +22,7 @@ inline constexpr char kCollectorHostSwitch[] = "wdp-collector-host"; inline constexpr char kVersionHeader[] = "Version"; inline constexpr int kCurrentVersion = 1; +// The default backoff policy to use for scheduling retry requests. inline constexpr net::BackoffEntry::Policy kBackoffPolicy = { .num_errors_to_ignore = 0, .initial_delay_ms = 10 * 1000, @@ -31,17 +32,29 @@ inline constexpr net::BackoffEntry::Policy kBackoffPolicy = { .entry_lifetime_ms = -1, .always_use_initial_delay = false}; +// Returns the non-proxied HPN host, used for acquiring anonymous credentials. std::string GetDirectHPNHost(); +// Returns the proxied HPN host, used for retrieving server config and page +// content submission. std::string GetAnonymousHPNHost(); +// Returns the "quorum" host, used for location config and page event +// submission. std::string GetQuorumHost(); +// Returns the full URL for the patterns config. GURL GetPatternsEndpoint(); +// Creates a new ResourceRequest with the given URL and credentials omitted. std::unique_ptr CreateResourceRequest(GURL url); +// Formats a given date as a string in the format "YYYYMMDD", in the UTC +// timezone. std::string FormatServerDate(const base::Time& date); +// Decodes URL-encoded components, converting escape sequences to their +// corresponding characters. std::string DecodeURLComponent(const std::string_view value); +// Extracts the value associated with a given key from a URL query string. 
std::optional ExtractValueFromQueryString( const std::string_view query_string, const std::string_view key); diff --git a/components/web_discovery/browser/wdp_service.cc b/components/web_discovery/browser/wdp_service.cc index c2cea0ca4a25..a5c31e29d303 100644 --- a/components/web_discovery/browser/wdp_service.cc +++ b/components/web_discovery/browser/wdp_service.cc @@ -158,9 +158,8 @@ void WDPService::OnDoubleFetched(const GURL& url, true)); } -void WDPService::OnFinishNavigation( - const GURL& url, - content::RenderFrameHost* render_frame_host) { +void WDPService::DidFinishLoad(const GURL& url, + content::RenderFrameHost* render_frame_host) { if (!content_scraper_) { return; } diff --git a/components/web_discovery/browser/wdp_service.h b/components/web_discovery/browser/wdp_service.h index f2ea1f068b35..6549dcf59356 100644 --- a/components/web_discovery/browser/wdp_service.h +++ b/components/web_discovery/browser/wdp_service.h @@ -35,6 +35,8 @@ class SharedURLLoaderFactory; namespace web_discovery { +// The main service for the native re-implementation of Web Discovery Project. +// Handles scraping and reporting of relevant pages for opted-in users. class WDPService : public KeyedService { public: WDPService( @@ -50,10 +52,14 @@ class WDPService : public KeyedService { static void RegisterLocalStatePrefs(PrefRegistrySimple* registry); static void RegisterProfilePrefs(PrefRegistrySimple* registry); + // Sets the extension preference to true if the preference for the native + // implementation is set to true and the feature is disabled. + // Relevant for a Griffin/variations rollback. static void SetExtensionPrefIfNativeDisabled(PrefService* profile_prefs); - void OnFinishNavigation(const GURL& url, - content::RenderFrameHost* render_frame_host); + // Called by `WebDiscoveryTabHelper` to notify on a page load. 
+ void DidFinishLoad(const GURL& url, + content::RenderFrameHost* render_frame_host); private: void Start(); diff --git a/components/web_discovery/browser/web_discovery_tab_helper.cc b/components/web_discovery/browser/web_discovery_tab_helper.cc index 98c120e2be25..023e260fe501 100644 --- a/components/web_discovery/browser/web_discovery_tab_helper.cc +++ b/components/web_discovery/browser/web_discovery_tab_helper.cc @@ -26,7 +26,7 @@ void WebDiscoveryTabHelper::DidFinishLoad( if (!render_frame_host->IsInPrimaryMainFrame()) { return; } - wdp_service_->OnFinishNavigation(url, render_frame_host); + wdp_service_->DidFinishLoad(url, render_frame_host); } WEB_CONTENTS_USER_DATA_KEY_IMPL(WebDiscoveryTabHelper); diff --git a/components/web_discovery/common/features.h b/components/web_discovery/common/features.h index 2083a40ec39c..45db0afd2555 100644 --- a/components/web_discovery/common/features.h +++ b/components/web_discovery/common/features.h @@ -10,6 +10,8 @@ namespace web_discovery::features { +// Enables the native re-implementation of the Web Discovery Project. +// If enabled, the Web Discovery component of the extension should be disabled. BASE_DECLARE_FEATURE(kWebDiscoveryNative); } // namespace web_discovery::features diff --git a/components/web_discovery/common/web_discovery.mojom b/components/web_discovery/common/web_discovery.mojom index 709d3bab3a57..94de01e15d6d 100644 --- a/components/web_discovery/common/web_discovery.mojom +++ b/components/web_discovery/common/web_discovery.mojom @@ -6,21 +6,31 @@ module web_discovery.mojom; struct SelectAttributeRequest { + // An optional selector for an element within the current selected element. + // The attribute will be retrieved from the embedded element. string? sub_selector; + // Arbitrary ID used for storing the scraped result. string key; + // Name of the attribute to scrape. string attribute; }; struct SelectRequest { + // The DOM selector for the element to scrape. 
string root_selector; + // Scrape requests for the selected element. array attribute_requests; }; struct AttributeResult { + // The DOM selector for the scraped element. string root_selector; + // A map of arbitrary IDs to scraped results. Value will be set to + // nullopt if the attribute was not available. map attribute_values; }; interface DocumentExtractor { + // Extracts DOM attributes from the current page in renderer. QueryElementAttributes(array requests) => (array results); }; diff --git a/components/web_discovery/renderer/blink_document_extractor.h b/components/web_discovery/renderer/blink_document_extractor.h index ce4e19c4b252..b502872395c3 100644 --- a/components/web_discovery/renderer/blink_document_extractor.h +++ b/components/web_discovery/renderer/blink_document_extractor.h @@ -16,6 +16,8 @@ namespace web_discovery { +// Extracts attributes from the current page +// for the native re-implementation of Web Discovery. class BlinkDocumentExtractor : public content::RenderFrameObserver, public mojom::DocumentExtractor { public: