From 3cfb538fea7ce8997ffab4b36aa8bb0a10b2871c Mon Sep 17 00:00:00 2001 From: Darnell Andries Date: Thu, 27 Jun 2024 14:19:38 -0700 Subject: [PATCH] Add comments for Web Discovery --- .../web_discovery/browser/content_scraper.h | 15 ++++ .../browser/credential_manager.h | 15 ++++ .../web_discovery/browser/credential_signer.h | 9 +++ .../browser/document_extractor/rs/src/lib.rs | 12 +++ .../web_discovery/browser/double_fetcher.h | 8 ++ .../web_discovery/browser/hash_detection.h | 3 + components/web_discovery/browser/patterns.h | 75 ++++++++++++++++++- .../web_discovery/browser/payload_generator.h | 4 + .../web_discovery/browser/privacy_guard.h | 11 +++ components/web_discovery/browser/regex_util.h | 2 + components/web_discovery/browser/reporter.h | 10 +++ .../web_discovery/browser/request_queue.h | 6 ++ .../browser/server_config_loader.h | 13 ++++ .../browser/signature_basename.h | 13 +++- components/web_discovery/browser/util.h | 13 ++++ .../web_discovery/browser/wdp_service.cc | 5 +- .../web_discovery/browser/wdp_service.h | 10 ++- .../browser/web_discovery_tab_helper.cc | 2 +- components/web_discovery/common/features.h | 2 + .../web_discovery/common/web_discovery.mojom | 10 +++ .../renderer/blink_document_extractor.h | 2 + 21 files changed, 230 insertions(+), 10 deletions(-) diff --git a/components/web_discovery/browser/content_scraper.h b/components/web_discovery/browser/content_scraper.h index 980db82b15b4..68d6cb60178a 100644 --- a/components/web_discovery/browser/content_scraper.h +++ b/components/web_discovery/browser/content_scraper.h @@ -33,6 +33,9 @@ struct PageScrapeResult { static std::unique_ptr FromValue(const base::Value& dict); GURL url; + // A map of DOM selectors to list of scraped values embedded in a Dict. + // Each dict contains arbitrary keys (defined in the patterns) to scraped + // values. 
base::flat_map> fields; std::string id; @@ -41,6 +44,18 @@ struct PageScrapeResult { std::optional query; }; +// Extracts attribute values from the page DOM for reporting purposes. +// ContentScraper utilizes the following techniques: +// +// a) Extraction within the current page in the renderer (via `ScrapePage`). +// The `mojom::DocumentExtractor` is used to request attribute values +// from the current DOM in the view. Typically, this is used to extract a +// search query, and decide whether the page is worthy of investigation +// and reporting. +// b) Parsing and extracting HTML from a double fetch. This follows +// the extraction in a). Used to extract all other needed details +// from the page i.e. search results. Uses a Rust library for DOM +// operations, in respect of Rule of Two. class ContentScraper { public: using PageScrapeResultCallback = diff --git a/components/web_discovery/browser/credential_manager.h b/components/web_discovery/browser/credential_manager.h index e32333e58f8b..4eb7418484fd 100644 --- a/components/web_discovery/browser/credential_manager.h +++ b/components/web_discovery/browser/credential_manager.h @@ -35,6 +35,16 @@ struct GenerateJoinRequestResult { std::string signature; }; +// Manages and utilizes anonymous credentials used for communicating +// with Web Discovery servers. These Direct Anonymous Attestation credentials +// are used to prevent Sybil attacks on the servers. +// The manager provides two key functions: +// +// a) "joining": acquires credentials from the Web Discovery server. Join +// requests +// are signed with a random RSA key that is persisted with the profile. +// b) "signing": uses the previously acquired credentials to sign submissions +// which is required in order for the servers to accept the request. 
class CredentialManager : public CredentialSigner { public: CredentialManager(PrefService* profile_prefs, @@ -45,6 +55,8 @@ class CredentialManager : public CredentialSigner { CredentialManager(const CredentialManager&) = delete; CredentialManager& operator=(const CredentialManager&) = delete; + // Acquires credentials for all dates/"group public keys" published in + // the server config, if not stored already. void JoinGroups(); // CredentialSigner: @@ -54,6 +66,9 @@ class CredentialManager : public CredentialSigner { std::vector basename, SignCallback callback) override; + // Uses a fixed seed in the anonymous credential manager + // to provide deterministic credentials & signatures which + // are useful for testing. void UseFixedSeedForTesting(); private: diff --git a/components/web_discovery/browser/credential_signer.h b/components/web_discovery/browser/credential_signer.h index ef80cbb8ddc2..b7e167cf1e49 100644 --- a/components/web_discovery/browser/credential_signer.h +++ b/components/web_discovery/browser/credential_signer.h @@ -19,7 +19,16 @@ class CredentialSigner { base::OnceCallback>)>; virtual ~CredentialSigner(); + // Returns true if a credential is available for the current date. + // The caller can expect future calls to `Sign` to succeed, if made today. virtual bool CredentialExistsForToday() = 0; + + // Signs a message for a given basename. The server has the ability + // to check whether two messages with the same basename were signed + // with the same credential without revealing the credential used, + // preventing Sybil attacks. + // See signature_basename.h/cc for more information on how the basename + // should be generated. 
virtual bool Sign(std::vector msg, std::vector basename, SignCallback callback) = 0; diff --git a/components/web_discovery/browser/document_extractor/rs/src/lib.rs b/components/web_discovery/browser/document_extractor/rs/src/lib.rs index ac52e8940731..17ce268b19e2 100644 --- a/components/web_discovery/browser/document_extractor/rs/src/lib.rs +++ b/components/web_discovery/browser/document_extractor/rs/src/lib.rs @@ -17,27 +17,39 @@ use kuchikiki::{ #[cxx::bridge(namespace = "rust_document_extractor")] mod ffi { pub struct SelectAttributeRequest { + /// An optional selector for an element within the current selected element. + /// The attribute will be retrieved from the embedded element. + /// If not needed, an empty string should be provided. pub sub_selector: String, + /// Arbitrary ID used for storing the scraped result. pub key: String, + /// Name of the attribute to scrape. pub attribute: String, } pub struct SelectRequest { + /// The DOM selector for the element to scrape. pub root_selector: String, + /// Scrape requests for the selected element. pub attribute_requests: Vec, } pub struct AttributePair { + /// Arbitrary ID for the scraped result. pub key: String, + /// The scraped value. Will be empty if attribute is not available. pub value: String, } pub struct AttributeResult { + /// The DOM selector for the scraped element. pub root_selector: String, + /// A list of arbitrary IDs and scraped value pairs. pub attribute_pairs: Vec, } extern "Rust" { + /// Extracts DOM attributes from the result of a double fetch. 
fn query_element_attributes( html: &CxxString, requests: &CxxVector, diff --git a/components/web_discovery/browser/double_fetcher.h b/components/web_discovery/browser/double_fetcher.h index 08f5e05bd480..e2b8c979a78d 100644 --- a/components/web_discovery/browser/double_fetcher.h +++ b/components/web_discovery/browser/double_fetcher.h @@ -24,6 +24,11 @@ class SimpleURLLoader; namespace web_discovery { +// Makes anonymous requests to relevant page URLs, without involvement of the +// user's session. In the case of search engine result pages, the result of the +// double fetch will be scraped for search engine results for a future submission. +// Uses `RequestQueue` to persist and schedule double fetches. Requests +// will be sent on somewhat random intervals averaging to a minute. class DoubleFetcher { public: using FetchedCallback = @@ -38,6 +43,9 @@ class DoubleFetcher { DoubleFetcher(const DoubleFetcher&) = delete; DoubleFetcher& operator=(const DoubleFetcher&) = delete; + // Queues a double fetch for a given URL. The associated data will be stored + // beside the queue request, and will be passed to the `FetchedCallback` + // upon completion. void ScheduleDoubleFetch(const GURL& url, base::Value associated_data); private: diff --git a/components/web_discovery/browser/hash_detection.h b/components/web_discovery/browser/hash_detection.h index 376674b4393e..c6461aa40bc4 100644 --- a/components/web_discovery/browser/hash_detection.h +++ b/components/web_discovery/browser/hash_detection.h @@ -12,6 +12,9 @@ namespace web_discovery { +// Uses a pre-trained Markov chain classifier to detect the likelihood +// of a hash in a given piece of text. Used in privacy guard functions +// for detecting potentially private URLs/queries. 
bool IsHashLikely(RegexUtil& regex_util, std::string value, double probability_multiplier = 1.0); diff --git a/components/web_discovery/browser/patterns.h b/components/web_discovery/browser/patterns.h index 6416089eddec..b9e3b749e104 100644 --- a/components/web_discovery/browser/patterns.h +++ b/components/web_discovery/browser/patterns.h @@ -21,12 +21,40 @@ class RE2; namespace web_discovery { -enum class ScrapeRuleType { kStandard, kSearchQuery, kWidgetTitle, kOther }; -enum class PayloadRuleType { kQuery, kSingle }; -enum class PayloadResultType { kSingle, kClustered, kCustom }; +enum class ScrapeRuleType { + // Will retrieve a value not defined in the DOM, such as the client country + // code or the current url. + kStandard, + // If the following two types are used for a rule, the value will be marked + // as the search query, which will be used for privacy checks. + kSearchQuery, + kWidgetTitle, + // All other rules should have this type. No special processing will be + // performed. + kOther }; +enum class PayloadRuleType { + // Coupled with the `kClustered` result type. + // All instances of a given attribute will be grouped into a single payload. + kQuery, + // Coupled with the `kSingle` result type. + // Each instance of a given attribute will have its own payload. + kSingle }; +enum class PayloadResultType { + // Coupled with the `kSingle` rule type. + kSingle, + // Coupled with the `kClustered` rule type. + kClustered, + // Currently unsupported/ignored. + kCustom }; +// Contains functions for refining the scraped value. The inner vector +// contains the function name and arguments for the function. using RefineFunctionList = std::vector>; +// Defines a rule for scraping an attribute from a given selected element. 
struct ScrapeRule { ScrapeRule(); ~ScrapeRule(); @@ -34,15 +62,23 @@ struct ScrapeRule { ScrapeRule(const ScrapeRule&) = delete; ScrapeRule& operator=(const ScrapeRule&) = delete; + // An optional selector for an element within the current selected element. + // The attribute will be retrieved from the embedded element. std::optional sub_selector; ScrapeRuleType rule_type; + // The name of the attribute to retrieve for a DOM element. std::string attribute; + // Functions used to refine the retrieved value. See the "func ids" defined + // in content_scraper.cc for all possible functions. RefineFunctionList functions_applied; }; +// A map of keys (arbitrary IDs used for storing the scraped result) to scrape +// rules. using ScrapeRuleGroup = base::flat_map>; +// A rule for providing a single key/value pair within the submission payload. struct PayloadRule { PayloadRule(); ~PayloadRule(); @@ -50,11 +86,19 @@ struct PayloadRule { PayloadRule(const PayloadRule&) = delete; PayloadRule& operator=(const PayloadRule&) = delete; + // The DOM selector of the scraped attribute. std::string selector; + // The arbitrary key associated with the scraped value. std::string key; + // If set to true, an array-like Dict (each dict key is an index) + // will be rendered. + // Each value in the Dict will be a Dict containing all keys/values + // associated with the selector. This is commonly used for listing search + // results. bool is_join = false; }; +// Contains rules for generating a payload for submission. struct PayloadRuleGroup { PayloadRuleGroup(); ~PayloadRuleGroup(); @@ -62,13 +106,18 @@ struct PayloadRuleGroup { PayloadRuleGroup(const PayloadRuleGroup&) = delete; PayloadRuleGroup& operator=(const PayloadRuleGroup&) = delete; + // An arbitrary ID for the rule group. Currently, this isn't used in the + // payload. std::string key; PayloadRuleType rule_type; PayloadResultType result_type; + // The name of the "action" for the given payload. 
std::string action; + // The rules for generating the fields within the payload. std::vector rules; }; +// Contains settings and rule groups associated with a particular URL. struct PatternsURLDetails { PatternsURLDetails(); ~PatternsURLDetails(); @@ -76,14 +125,24 @@ struct PatternsURLDetails { PatternsURLDetails(const PatternsURLDetails&) = delete; PatternsURLDetails& operator=(const PatternsURLDetails&) = delete; + // The regex used to match the URL in the address bar. std::unique_ptr url_regex; bool is_search_engine; + // The two or three-letter arbitrary id associated with the site. std::string id; + // The search path prefix used for constructing private search queries + // for double fetching. std::optional search_template_prefix; + // The scraping rules for the site. A map of DOM selectors + // to rule groups. base::flat_map scrape_rule_groups; + // The payload generation rules used for generating submissions + // from scraped attributes. std::vector payload_rule_groups; }; +// The full "patterns" configuration provided by the Web Discovery server. +// The configuration provides rules for scraping certain pages. struct PatternsGroup { PatternsGroup(); ~PatternsGroup(); @@ -91,13 +150,23 @@ struct PatternsGroup { PatternsGroup(const PatternsGroup&) = delete; PatternsGroup& operator=(const PatternsGroup&) = delete; + // Checks URL against all URL regexes in either the "normal" or "strict" set, + // and returns the URL details/rules if available. const PatternsURLDetails* GetMatchingURLPattern(const GURL& url, bool is_strict_scrape) const; + // A list of URLs and rules used for scraping pages in the renderer, + // pre-"double fetch". These rules typically scrape simple attributes which + // are used to determine whether a page is private (i.e. the search query). std::vector normal_patterns; + // A list of URLs and rules used for scraping contents from a "double fetch". + // The rules are usually more involved than the "normal" rules. 
In the case of + // search engine result pages, the rules will be used to retrieve the + // search results and any other relevant details. std::vector strict_patterns; }; +// Returns nullptr if parsing fails. std::unique_ptr ParsePatterns(const std::string& patterns_json); } // namespace web_discovery diff --git a/components/web_discovery/browser/payload_generator.h b/components/web_discovery/browser/payload_generator.h index 732eff6234fa..b234c5872abc 100644 --- a/components/web_discovery/browser/payload_generator.h +++ b/components/web_discovery/browser/payload_generator.h @@ -20,12 +20,16 @@ namespace web_discovery { inline constexpr char kActionKey[] = "action"; inline constexpr char kInnerPayloadKey[] = "payload"; +// Generates "query" messages using the payload generation rules +// and scraped data for a given site. std::vector GenerateQueryPayloads( const ServerConfig& server_config, RegexUtil& regex_util, const PatternsURLDetails* url_details, std::unique_ptr scrape_result); +// Generates an "alive" message to indicate an opted-in +// status to the server. base::Value::Dict GenerateAlivePayload(const ServerConfig& server_config, std::string date_hour); diff --git a/components/web_discovery/browser/privacy_guard.h b/components/web_discovery/browser/privacy_guard.h index 38328f4068e2..36c2fcc971b4 100644 --- a/components/web_discovery/browser/privacy_guard.h +++ b/components/web_discovery/browser/privacy_guard.h @@ -14,18 +14,29 @@ namespace web_discovery { +// Checks if a URL is likely to be private based on various criteria. +// If true, the page should not be investigated or reported. bool IsPrivateURLLikely(RegexUtil& regex_util, const GURL& url, const PatternsURLDetails* matching_url_details); +// Determines if a search query is likely to contain private information. +// If true, the search query should not be investigated or reported. 
bool IsPrivateQueryLikely(RegexUtil& regex_util, const std::string& query); +// Generates a simple search URL (without additional query parameters) +// based on the original search URL and query. Used for the double fetch +// to ensure that the user's profile is not involved in the query. GURL GeneratePrivateSearchURL(const GURL& original_url, const std::string& query, const PatternsURLDetails& matching_url_details); +// Checks if a URL should be dropped due to its length or content. +// Currently only used for determining whether to mask a URL +// in the function below. bool ShouldDropLongURL(RegexUtil& regex_util, const GURL& url); +// Masks a URL to protect privacy. Returns nullopt if URL is invalid. std::optional MaskURL(RegexUtil& regex_util, const GURL& url); } // namespace web_discovery diff --git a/components/web_discovery/browser/regex_util.h b/components/web_discovery/browser/regex_util.h index 970030c55230..e08dad02fd6e 100644 --- a/components/web_discovery/browser/regex_util.h +++ b/components/web_discovery/browser/regex_util.h @@ -16,6 +16,8 @@ namespace web_discovery { +// Lazily creates and caches pre-compiled regexes, mainly used for +// privacy risk assessment of page URLs/contents. 
class RegexUtil { public: RegexUtil(); diff --git a/components/web_discovery/browser/reporter.h b/components/web_discovery/browser/reporter.h index a3649b049c72..f79e14fcdbc7 100644 --- a/components/web_discovery/browser/reporter.h +++ b/components/web_discovery/browser/reporter.h @@ -29,6 +29,15 @@ class SimpleURLLoader; namespace web_discovery { +// Handles all functions required for reporting generated payloads: +// - zlib compression +// - ECDH key derivation + key exchange +// - AES encryption (to prevent eavesdropping by the server proxy) +// - signing the request using anonymous credentials from the +// `CredentialManager` (to prevent Sybil attacks on the server) +// - performing the request for submission +// Uses `RequestQueue` to persist and schedule submissions. Reports +// will be processed on somewhat random intervals averaging to a minute. class Reporter { public: Reporter(PrefService* profile_prefs, @@ -41,6 +50,7 @@ class Reporter { Reporter(const Reporter&) = delete; Reporter& operator=(const Reporter&) = delete; + // Schedule a generated payload for submission. void ScheduleSend(base::Value::Dict payload); private: diff --git a/components/web_discovery/browser/request_queue.h b/components/web_discovery/browser/request_queue.h index 6fbfbd194b20..fd385bcee22a 100644 --- a/components/web_discovery/browser/request_queue.h +++ b/components/web_discovery/browser/request_queue.h @@ -16,6 +16,10 @@ class PrefService; namespace web_discovery { +// Persists and schedules requests on randomized intervals within +// an interval range. If request failures exceed the threshold defined in +// `max_retries`, the request will be dropped from the list. If a persisted +// request age exceeds `request_max_age`, the request will be dropped. class RequestQueue { public: RequestQueue( @@ -31,6 +35,8 @@ class RequestQueue { RequestQueue(const RequestQueue&) = delete; RequestQueue& operator=(const RequestQueue&) = delete; + // Persist and schedule a request. 
The arbitrary data will be passed + // to `start_request_callback` on the scheduled interval. void ScheduleRequest(base::Value request_data); // Returns data value if request is deleted from queue, due to the retry limit // or success diff --git a/components/web_discovery/browser/server_config_loader.h b/components/web_discovery/browser/server_config_loader.h index 797227ca7256..856073249b34 100644 --- a/components/web_discovery/browser/server_config_loader.h +++ b/components/web_discovery/browser/server_config_loader.h @@ -58,6 +58,12 @@ struct ServerConfig { std::string location; }; +// Handles retrieval, updating and caching of the following server +// configurations: +// - HPN server config: contains public keys, and "source maps" used +// for generating basenames. +// - "quorum" config: contains the country code of the user +// - patterns: contains the rules for scraping/submission of certain pages class ServerConfigLoader { public: ServerConfigLoader(PrefService* local_state, @@ -70,9 +76,16 @@ class ServerConfigLoader { ServerConfigLoader(const ServerConfigLoader&) = delete; ServerConfigLoader& operator=(const ServerConfigLoader&) = delete; + // Loads all three server configurations. Update requests will be scheduled + // once complete. void LoadConfigs(); + // Returns the last loaded server config, which is a combination of the + // HPN and "quorum" configs. May only call after the config_callback is + // triggered. const ServerConfig& GetLastServerConfig() const; + // Returns the pattern config. May only call after the patterns_callback is + // triggered. 
const PatternsGroup& GetLastPatterns() const; void SetLastServerConfigForTesting( diff --git a/components/web_discovery/browser/signature_basename.h b/components/web_discovery/browser/signature_basename.h index fb6101608649..1a07cf027e25 100644 --- a/components/web_discovery/browser/signature_basename.h +++ b/components/web_discovery/browser/signature_basename.h @@ -33,16 +33,27 @@ struct BasenameResult { uint32_t count_tag_hash; }; +// Generates a basename used for the signature. The basename is a sha hash +// of the message "action" (i.e. "query"), the settings for that action +// (defined in the server's "source map"), cherry-picked attributes from the +// payload and the count index for the given message. The count will be under +// the limit defined for the action; the function will return nullopt if the +// limit for the action is exceeded. std::optional GenerateBasename( PrefService* profile_prefs, const ServerConfig& server_config, RegexUtil& regex_util, const base::Value::Dict& payload); +// Saves the count returned from `GenerateBasename` in the prefs. +// This ensures that the count index cannot be used for future messages +// within the defined action limit period (default is 24 hours). +// This should be called after a submission is successfully sent to +// the server. 
void SaveBasenameCount(PrefService* profile_prefs, uint32_t count_tag_hash, size_t count); } // namespace web_discovery -#endif // BRAVE_COMPONENTS_WEB_DISCOVERY_BROWSER_PAYLOAD_GENERATOR_H_ +#endif // BRAVE_COMPONENTS_WEB_DISCOVERY_BROWSER_SIGNATURE_BASENAME_H_ diff --git a/components/web_discovery/browser/util.h b/components/web_discovery/browser/util.h index 4a00b6c17262..9792b4622d55 100644 --- a/components/web_discovery/browser/util.h +++ b/components/web_discovery/browser/util.h @@ -22,6 +22,7 @@ inline constexpr char kCollectorHostSwitch[] = "wdp-collector-host"; inline constexpr char kVersionHeader[] = "Version"; inline constexpr int kCurrentVersion = 1; +// The default backoff policy to use for scheduling retry requests. inline constexpr net::BackoffEntry::Policy kBackoffPolicy = { .num_errors_to_ignore = 0, .initial_delay_ms = 10 * 1000, @@ -31,17 +32,29 @@ inline constexpr net::BackoffEntry::Policy kBackoffPolicy = { .entry_lifetime_ms = -1, .always_use_initial_delay = false}; +// Returns the non-proxied HPN host, used for acquiring anonymous credentials. std::string GetDirectHPNHost(); +// Returns the proxied HPN host, used for retrieving server config and page +// content submission. std::string GetAnonymousHPNHost(); +// Returns the "quorum" host, used for location config and page event +// submission. std::string GetQuorumHost(); +// Returns the full URL for the patterns config. GURL GetPatternsEndpoint(); +// Creates a new ResourceRequest with the given URL and credentials omitted. std::unique_ptr CreateResourceRequest(GURL url); +// Formats a given date as a string in the format "YYYYMMDD", in the UTC +// timezone. std::string FormatServerDate(const base::Time& date); +// Decodes URL-encoded components, converting escape sequences to their +// corresponding characters. std::string DecodeURLComponent(const std::string_view value); +// Extracts the value associated with a given key from a URL query string. 
std::optional ExtractValueFromQueryString( const std::string_view query_string, const std::string_view key); diff --git a/components/web_discovery/browser/wdp_service.cc b/components/web_discovery/browser/wdp_service.cc index c2cea0ca4a25..a5c31e29d303 100644 --- a/components/web_discovery/browser/wdp_service.cc +++ b/components/web_discovery/browser/wdp_service.cc @@ -158,9 +158,8 @@ void WDPService::OnDoubleFetched(const GURL& url, true)); } -void WDPService::OnFinishNavigation( - const GURL& url, - content::RenderFrameHost* render_frame_host) { +void WDPService::DidFinishLoad(const GURL& url, + content::RenderFrameHost* render_frame_host) { if (!content_scraper_) { return; } diff --git a/components/web_discovery/browser/wdp_service.h b/components/web_discovery/browser/wdp_service.h index f2ea1f068b35..6549dcf59356 100644 --- a/components/web_discovery/browser/wdp_service.h +++ b/components/web_discovery/browser/wdp_service.h @@ -35,6 +35,8 @@ class SharedURLLoaderFactory; namespace web_discovery { +// The main service for the native re-implementation of Web Discovery Project. +// Handles scraping and reporting of relevant pages for opted-in users. class WDPService : public KeyedService { public: WDPService( @@ -50,10 +52,14 @@ class WDPService : public KeyedService { static void RegisterLocalStatePrefs(PrefRegistrySimple* registry); static void RegisterProfilePrefs(PrefRegistrySimple* registry); + // Sets the extension preference to true if the preference for the native + // implementation is set to true and the feature is disabled. + // Relevant for a Griffin/variations rollback. static void SetExtensionPrefIfNativeDisabled(PrefService* profile_prefs); - void OnFinishNavigation(const GURL& url, - content::RenderFrameHost* render_frame_host); + // Called by `WebDiscoveryTabHelper` to notify on a page load. 
+ void DidFinishLoad(const GURL& url, + content::RenderFrameHost* render_frame_host); private: void Start(); diff --git a/components/web_discovery/browser/web_discovery_tab_helper.cc b/components/web_discovery/browser/web_discovery_tab_helper.cc index 98c120e2be25..023e260fe501 100644 --- a/components/web_discovery/browser/web_discovery_tab_helper.cc +++ b/components/web_discovery/browser/web_discovery_tab_helper.cc @@ -26,7 +26,7 @@ void WebDiscoveryTabHelper::DidFinishLoad( if (!render_frame_host->IsInPrimaryMainFrame()) { return; } - wdp_service_->OnFinishNavigation(url, render_frame_host); + wdp_service_->DidFinishLoad(url, render_frame_host); } WEB_CONTENTS_USER_DATA_KEY_IMPL(WebDiscoveryTabHelper); diff --git a/components/web_discovery/common/features.h b/components/web_discovery/common/features.h index 2083a40ec39c..45db0afd2555 100644 --- a/components/web_discovery/common/features.h +++ b/components/web_discovery/common/features.h @@ -10,6 +10,8 @@ namespace web_discovery::features { +// Enables the native re-implementation of the Web Discovery Project. +// If enabled, the Web Discovery component of the extension should be disabled. BASE_DECLARE_FEATURE(kWebDiscoveryNative); } // namespace web_discovery::features diff --git a/components/web_discovery/common/web_discovery.mojom b/components/web_discovery/common/web_discovery.mojom index 709d3bab3a57..94de01e15d6d 100644 --- a/components/web_discovery/common/web_discovery.mojom +++ b/components/web_discovery/common/web_discovery.mojom @@ -6,21 +6,31 @@ module web_discovery.mojom; struct SelectAttributeRequest { + // An optional selector for an element within the current selected element. + // The attribute will be retrieved from the embedded element. string? sub_selector; + // Arbitrary ID used for storing the scraped result. string key; + // Name of the attribute to scrape. string attribute; }; struct SelectRequest { + // The DOM selector for the element to scrape. 
string root_selector; + // Scrape requests for the selected element. array attribute_requests; }; struct AttributeResult { + // The DOM selector for the scraped element. string root_selector; + // A map of arbitrary IDs to scraped results. Value will be set to + // nullopt if the attribute was not available. map attribute_values; }; interface DocumentExtractor { + // Extracts DOM attributes from the current page in renderer. QueryElementAttributes(array requests) => (array results); }; diff --git a/components/web_discovery/renderer/blink_document_extractor.h b/components/web_discovery/renderer/blink_document_extractor.h index ce4e19c4b252..b502872395c3 100644 --- a/components/web_discovery/renderer/blink_document_extractor.h +++ b/components/web_discovery/renderer/blink_document_extractor.h @@ -16,6 +16,8 @@ namespace web_discovery { +// Extracts attributes from the current page +// for the native re-implementation of Web Discovery. class BlinkDocumentExtractor : public content::RenderFrameObserver, public mojom::DocumentExtractor { public: