Add comments for Web Discovery
DJAndries committed Jun 27, 2024
1 parent fbb2577 commit 3cfb538
Showing 21 changed files with 230 additions and 10 deletions.
15 changes: 15 additions & 0 deletions components/web_discovery/browser/content_scraper.h
@@ -33,6 +33,9 @@ struct PageScrapeResult {
static std::unique_ptr<PageScrapeResult> FromValue(const base::Value& dict);

GURL url;
// A map of DOM selectors to lists of scraped values, each wrapped in a
// Dict. Each Dict maps arbitrary keys (defined in the patterns) to
// scraped values.
base::flat_map<std::string, std::vector<base::Value::Dict>> fields;
std::string id;

@@ -41,6 +44,18 @@ struct PageScrapeResult {
std::optional<std::string> query;
};

// Extracts attribute values from the page DOM for reporting purposes.
// ContentScraper utilizes the following techniques:
//
// a) Extraction within the current page in the renderer (via `ScrapePage`).
//    The `mojom::DocumentExtractor` is used to request attribute values
//    from the current DOM in the view. Typically, this is used to extract a
//    search query, and decide whether the page is worthy of investigation
//    and reporting.
// b) Parsing and extracting HTML from a double fetch. This follows
//    the extraction in a). Used to extract all other needed details
//    from the page (e.g. search results). Uses a Rust library for DOM
//    operations, in accordance with the Rule of Two.
class ContentScraper {
public:
using PageScrapeResultCallback =
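
A minimal sketch of how a consumer might walk the `fields` map documented above; the "div.result" selector and "title" key are hypothetical pattern-defined names, and the surrounding wiring is assumed:

#include "base/logging.h"
#include "components/web_discovery/browser/content_scraper.h"

// Sketch only: assumes a PageScrapeResult produced by ContentScraper.
void InspectScrapeResult(const web_discovery::PageScrapeResult& result) {
  for (const auto& [selector, dicts] : result.fields) {
    // `selector` is a DOM selector from the patterns config
    // (e.g. "div.result"); `dicts` holds one Dict per matched element.
    for (const base::Value::Dict& dict : dicts) {
      // Each key (e.g. "title") is defined by a scrape rule.
      if (const std::string* title = dict.FindString("title")) {
        VLOG(1) << selector << ": " << *title;
      }
    }
  }
}
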
15 changes: 15 additions & 0 deletions components/web_discovery/browser/credential_manager.h
@@ -35,6 +35,16 @@ struct GenerateJoinRequestResult {
std::string signature;
};

// Manages and utilizes anonymous credentials used for communicating
// with Web Discovery servers. These Direct Anonymous Attestation credentials
// are used to prevent Sybil attacks on the servers.
// The manager provides two key functions:
//
// a) "joining": acquires credentials from the Web Discovery server. Join
// requests
// are signed with a random RSA key that is persisted with the profile.
// b) "signing": uses the previously acquired credentials to sign submissions
// which is required in order for the servers to accept the request.
class CredentialManager : public CredentialSigner {
public:
CredentialManager(PrefService* profile_prefs,
@@ -45,6 +55,8 @@ class CredentialManager : public CredentialSigner {
CredentialManager(const CredentialManager&) = delete;
CredentialManager& operator=(const CredentialManager&) = delete;

// Acquires credentials for all dates/"group public keys" published in
// the server config, if not stored already.
void JoinGroups();

// CredentialSigner:
@@ -54,6 +66,9 @@
std::vector<const uint8_t> basename,
SignCallback callback) override;

// Uses a fixed seed in the anonymous credential manager
// to provide deterministic credentials & signatures, which
// is useful for testing.
void UseFixedSeedForTesting();

private:
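
A small usage sketch, assuming a `CredentialManager` already constructed with profile prefs and a URL loader factory; the trigger function is hypothetical:

#include "components/web_discovery/browser/credential_manager.h"

// Sketch only: called once the server config (with its published
// date/"group public key" list) is available.
void OnServerConfigReady(web_discovery::CredentialManager* manager) {
  // Fetches any credentials not already stored; later Sign() calls
  // depend on the joins for the relevant dates having completed.
  manager->JoinGroups();
}
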
9 changes: 9 additions & 0 deletions components/web_discovery/browser/credential_signer.h
@@ -19,7 +19,16 @@ class CredentialSigner {
base::OnceCallback<void(std::optional<std::vector<const uint8_t>>)>;
virtual ~CredentialSigner();

// Returns true if a credential is available for the current date.
// The caller can expect future calls to `Sign` to succeed, if made today.
virtual bool CredentialExistsForToday() = 0;

// Signs a message for a given basename. The server has the ability
// to check whether two messages with the same basename were signed
// with the same credential without revealing the credential used,
// preventing Sybil attacks.
// See signature_basename.h/cc for more information on how the basename
// should be generated.
virtual bool Sign(std::vector<const uint8_t> msg,
std::vector<const uint8_t> basename,
SignCallback callback) = 0;
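
A hedged sketch of the signing flow; the message and basename values are placeholders, and the real basename derivation lives in signature_basename.h/cc:

#include "base/functional/bind.h"
#include "components/web_discovery/browser/credential_signer.h"

// Sketch only: signs `msg` if a credential for today is available.
void SignSubmission(web_discovery::CredentialSigner* signer,
                    std::vector<const uint8_t> msg,
                    std::vector<const uint8_t> basename) {
  if (!signer->CredentialExistsForToday()) {
    return;  // Joining hasn't finished for today; Sign would fail.
  }
  signer->Sign(
      std::move(msg), std::move(basename),
      base::BindOnce([](std::optional<std::vector<const uint8_t>> signature) {
        // Two messages signed with the same basename are linkable by the
        // server, so basenames must be unique per action/date.
        if (signature) {
          // Attach *signature to the outgoing submission.
        }
      }));
}
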
12 changes: 12 additions & 0 deletions components/web_discovery/browser/document_extractor/rs/src/lib.rs
@@ -17,27 +17,39 @@ use kuchikiki::{
#[cxx::bridge(namespace = "rust_document_extractor")]
mod ffi {
pub struct SelectAttributeRequest {
/// An optional selector for an element within the current selected element.
/// The attribute will be retrieved from the embedded element.
/// If not needed, an empty string should be provided.
pub sub_selector: String,
/// Arbitrary ID used for storing the scraped result.
pub key: String,
/// Name of the attribute to scrape.
pub attribute: String,
}

pub struct SelectRequest {
/// The DOM selector for the element to scrape.
pub root_selector: String,
/// Scrape requests for the selected element.
pub attribute_requests: Vec<SelectAttributeRequest>,
}

pub struct AttributePair {
/// Arbitrary ID for the scraped result.
pub key: String,
/// The scraped value. Will be empty if attribute is not available.
pub value: String,
}

pub struct AttributeResult {
/// The DOM selector for the scraped element.
pub root_selector: String,
/// A list of arbitrary IDs and scraped value pairs.
pub attribute_pairs: Vec<AttributePair>,
}

extern "Rust" {
/// Extracts DOM attributes from the result of a double fetch.
fn query_element_attributes(
html: &CxxString,
requests: &CxxVector<SelectRequest>,
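
A sketch of how the C++ side might build a request for this bridge. The struct shapes come from the bridge definition above, but the generated header path, the `textContent` attribute, and the selectors are assumptions:

#include "components/web_discovery/browser/document_extractor/rs/src/lib.rs.h"

// Sketch only: one request that scrapes the title link of each result.
rust_document_extractor::SelectRequest MakeResultTitleRequest() {
  rust_document_extractor::SelectAttributeRequest attribute_request;
  attribute_request.sub_selector = "a.title";   // element within the root
  attribute_request.key = "title";              // arbitrary result ID
  attribute_request.attribute = "textContent";  // attribute to scrape
  rust_document_extractor::SelectRequest request;
  request.root_selector = "div.result";         // element to select
  request.attribute_requests.push_back(std::move(attribute_request));
  return request;
}
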
8 changes: 8 additions & 0 deletions components/web_discovery/browser/double_fetcher.h
@@ -24,6 +24,11 @@ class SimpleURLLoader;

namespace web_discovery {

// Makes anonymous requests to relevant page URLs, without involvement of the
// user's session. In the case of search engine result pages, the result of
// the double fetch will be scraped for search results for a future
// submission. Uses `RequestQueue` to persist and schedule double fetches.
// Requests will be sent at somewhat random intervals averaging a minute.
class DoubleFetcher {
public:
using FetchedCallback =
@@ -38,6 +43,9 @@ class DoubleFetcher {
DoubleFetcher(const DoubleFetcher&) = delete;
DoubleFetcher& operator=(const DoubleFetcher&) = delete;

// Queues a double fetch for a given URL. The associated data will be stored
// alongside the queued request, and will be passed to the `FetchedCallback`
// upon completion.
void ScheduleDoubleFetch(const GURL& url, base::Value associated_data);

private:
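
A usage sketch, assuming a constructed `DoubleFetcher`; the helper and the "query" key in the associated data are hypothetical:

#include "components/web_discovery/browser/double_fetcher.h"

// Sketch only: queue an anonymous re-fetch of a search results page.
void QueueSearchPageFetch(web_discovery::DoubleFetcher* fetcher,
                          const GURL& url,
                          const std::string& query) {
  base::Value::Dict associated_data;
  associated_data.Set("query", query);
  // The data is persisted beside the queued request and handed back to
  // the FetchedCallback once the fetch completes.
  fetcher->ScheduleDoubleFetch(url, base::Value(std::move(associated_data)));
}
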
3 changes: 3 additions & 0 deletions components/web_discovery/browser/hash_detection.h
@@ -12,6 +12,9 @@

namespace web_discovery {

// Uses a pre-trained Markov chain classifier to detect the likelihood
// of a hash in a given piece of text. Used in privacy guard functions
// for detecting potentially private URLs/queries.
bool IsHashLikely(RegexUtil& regex_util,
std::string value,
double probability_multiplier = 1.0);
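
A sketch of how the classifier might gate reporting; `LooksReportable` is a hypothetical helper, not part of the component:

#include "components/web_discovery/browser/hash_detection.h"

// Sketch only: a query that classifies as hash-like is treated as
// potentially private (e.g. a session token) and dropped.
bool LooksReportable(web_discovery::RegexUtil& regex_util,
                     const std::string& query) {
  return !web_discovery::IsHashLikely(regex_util, query);
}
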
75 changes: 72 additions & 3 deletions components/web_discovery/browser/patterns.h
@@ -21,83 +21,152 @@ class RE2;

namespace web_discovery {

enum class ScrapeRuleType { kStandard, kSearchQuery, kWidgetTitle, kOther };
enum class PayloadRuleType { kQuery, kSingle };
enum class PayloadResultType { kSingle, kClustered, kCustom };
enum class ScrapeRuleType {
// Will retrieve a value not defined in the DOM, such as the client country
// code or the current URL.
kStandard,
// If the following two types are used for a rule, the value will be marked
// as the search query, which will be used for privacy checks.
kSearchQuery,
kWidgetTitle,
// All other rules should have this type. No special processing will be
// performed.
kOther
};
enum class PayloadRuleType {
// Coupled with the `kClustered` result type.
// All instances of a given attribute will be grouped into a single payload.
kQuery,
// Coupled with the `kSingle` result type.
// Each instance of a given attribute will have its own payload.
kSingle
};
enum class PayloadResultType {
// Coupled with the `kSingle` rule type.
kSingle,
// Coupled with the `kClustered` rule type.
kClustered,
// Currently unsupported/ignored.
kCustom
};

// Contains functions for refining the scraped value. The inner vector
// contains the function name and arguments for the function.
using RefineFunctionList = std::vector<std::vector<base::Value>>;

// Defines a rule for scraping an attribute from a given selected element.
struct ScrapeRule {
ScrapeRule();
~ScrapeRule();

ScrapeRule(const ScrapeRule&) = delete;
ScrapeRule& operator=(const ScrapeRule&) = delete;

// An optional selector for an element within the current selected element.
// The attribute will be retrieved from the embedded element.
std::optional<std::string> sub_selector;
ScrapeRuleType rule_type;
// The name of the attribute to retrieve for a DOM element.
std::string attribute;
// Functions used to refine the retrieved value. See the "func ids" defined
// in content_scraper.cc for all possible functions.
RefineFunctionList functions_applied;
};

// A map of keys (arbitrary IDs used for storing the scraped result) to scrape
// rules.
using ScrapeRuleGroup =
base::flat_map<std::string, std::unique_ptr<ScrapeRule>>;

// A rule for providing a single key/value pair within the submission payload.
struct PayloadRule {
PayloadRule();
~PayloadRule();

PayloadRule(const PayloadRule&) = delete;
PayloadRule& operator=(const PayloadRule&) = delete;

// The DOM selector of the scraped attribute.
std::string selector;
// The arbitrary key associated with the scraped value.
std::string key;
// If set to true, an array-like Dict (each dict key is an index)
// will be rendered.
// Each value in the Dict will be a Dict containing all keys/values
// associated with the selector. This is commonly used for listing search
// results.
bool is_join = false;
};

// Contains rules for generating a payload for submission.
struct PayloadRuleGroup {
PayloadRuleGroup();
~PayloadRuleGroup();

PayloadRuleGroup(const PayloadRuleGroup&) = delete;
PayloadRuleGroup& operator=(const PayloadRuleGroup&) = delete;

// An arbitrary ID for the rule group. Currently, this isn't used in the
// payload.
std::string key;
PayloadRuleType rule_type;
PayloadResultType result_type;
// The name of the "action" for the given payload.
std::string action;
// The rules for generating the fields within the payload.
std::vector<PayloadRule> rules;
};

// Contains settings and rule groups associated with a particular URL.
struct PatternsURLDetails {
PatternsURLDetails();
~PatternsURLDetails();

PatternsURLDetails(const PatternsURLDetails&) = delete;
PatternsURLDetails& operator=(const PatternsURLDetails&) = delete;

// The regex used to match the URL in the address bar.
std::unique_ptr<re2::RE2> url_regex;
bool is_search_engine;
// The arbitrary two- or three-letter ID associated with the site.
std::string id;
// The search path prefix used for constructing private search queries
// for double fetching.
std::optional<std::string> search_template_prefix;
// The scraping rules for the site. A map of DOM selectors
// to rule groups.
base::flat_map<std::string, ScrapeRuleGroup> scrape_rule_groups;
// The payload generation rules used for generating submissions
// from scraped attributes.
std::vector<PayloadRuleGroup> payload_rule_groups;
};

// The full "patterns" configuration provided by the Web Discovery server.
// The configuration provides rules for scraping certain pages.
struct PatternsGroup {
PatternsGroup();
~PatternsGroup();

PatternsGroup(const PatternsGroup&) = delete;
PatternsGroup& operator=(const PatternsGroup&) = delete;

// Checks a URL against all URL regexes in either the "normal" or "strict"
// set, and returns the URL details/rules if available.
const PatternsURLDetails* GetMatchingURLPattern(const GURL& url,
bool is_strict_scrape) const;

// A list of URLs and rules used for scraping pages in the renderer,
// pre-"double fetch". These rules typically scrape simple attributes which
// are used to determine whether a page is private (i.e. the search query).
std::vector<PatternsURLDetails> normal_patterns;
// A list of URLs and rules used for scraping contents from a "double fetch".
// The rules are usually more involved than the "normal" rules. In the case of
// search engine result pages, the rules will be used to retrieve the
// search results and any other relevant details.
std::vector<PatternsURLDetails> strict_patterns;
};

// Returns nullptr if parsing fails.
std::unique_ptr<PatternsGroup> ParsePatterns(const std::string& patterns_json);

} // namespace web_discovery
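
A sketch of consuming the parsed configuration; the JSON source and the control flow around the match are assumptions:

#include "components/web_discovery/browser/patterns.h"

// Sketch only: parse a server-provided config and match a page URL.
void MatchURL(const std::string& patterns_json, const GURL& url) {
  std::unique_ptr<web_discovery::PatternsGroup> patterns =
      web_discovery::ParsePatterns(patterns_json);
  if (!patterns) {
    return;  // Parsing failed; ParsePatterns returns nullptr.
  }
  // Pre-"double fetch" match against the renderer-side "normal" rules.
  const web_discovery::PatternsURLDetails* details =
      patterns->GetMatchingURLPattern(url, /*is_strict_scrape=*/false);
  if (details && details->is_search_engine) {
    // details->scrape_rule_groups drives the renderer-side extraction.
  }
}
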
4 changes: 4 additions & 0 deletions components/web_discovery/browser/payload_generator.h
@@ -20,12 +20,16 @@ namespace web_discovery {
inline constexpr char kActionKey[] = "action";
inline constexpr char kInnerPayloadKey[] = "payload";

// Generates "query" messages using the payload generation rules
// and scraped data for a given site.
std::vector<base::Value::Dict> GenerateQueryPayloads(
const ServerConfig& server_config,
RegexUtil& regex_util,
const PatternsURLDetails* url_details,
std::unique_ptr<PageScrapeResult> scrape_result);

// Generates an "alive" message to indicate an opted-in
// status to the server.
base::Value::Dict GenerateAlivePayload(const ServerConfig& server_config,
std::string date_hour);

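
A sketch of generating submissions from a strict scrape; all inputs are assumed to come from the surrounding Web Discovery machinery:

#include "components/web_discovery/browser/payload_generator.h"

// Sketch only: build "query" messages ready for the Reporter.
void BuildQueryPayloads(
    const web_discovery::ServerConfig& server_config,
    web_discovery::RegexUtil& regex_util,
    const web_discovery::PatternsURLDetails* url_details,
    std::unique_ptr<web_discovery::PageScrapeResult> scrape_result) {
  std::vector<base::Value::Dict> payloads =
      web_discovery::GenerateQueryPayloads(
          server_config, regex_util, url_details, std::move(scrape_result));
  // Each Dict carries an "action" key and an inner "payload" Dict
  // (kActionKey/kInnerPayloadKey above), ready for Reporter::ScheduleSend.
}
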
11 changes: 11 additions & 0 deletions components/web_discovery/browser/privacy_guard.h
@@ -14,18 +14,29 @@

namespace web_discovery {

// Checks if a URL is likely to be private based on various criteria.
// If true, the page should not be investigated or reported.
bool IsPrivateURLLikely(RegexUtil& regex_util,
const GURL& url,
const PatternsURLDetails* matching_url_details);

// Determines if a search query is likely to contain private information.
// If true, the search query should not be investigated or reported.
bool IsPrivateQueryLikely(RegexUtil& regex_util, const std::string& query);

// Generates a simple search URL (without additional query parameters)
// based on the original search URL and query. Used for the double fetch
// to ensure that the user's profile is not involved in the query.
GURL GeneratePrivateSearchURL(const GURL& original_url,
const std::string& query,
const PatternsURLDetails& matching_url_details);

// Checks if a URL should be dropped due to its length or content.
// Currently only used for determining whether to mask a URL
// in the function below.
bool ShouldDropLongURL(RegexUtil& regex_util, const GURL& url);

// Masks a URL to protect privacy. Returns nullopt if URL is invalid.
std::optional<std::string> MaskURL(RegexUtil& regex_util, const GURL& url);

} // namespace web_discovery
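
A sketch combining the guards above into a pre-fetch check; the ordering is illustrative rather than the component's actual control flow:

#include <optional>

#include "components/web_discovery/browser/privacy_guard.h"

// Sketch only: returns a sanitized URL for the double fetch, or nullopt
// if the page/query looks private.
std::optional<GURL> PrepareDoubleFetchURL(
    web_discovery::RegexUtil& regex_util,
    const GURL& url,
    const std::string& query,
    const web_discovery::PatternsURLDetails& details) {
  if (web_discovery::IsPrivateURLLikely(regex_util, url, &details) ||
      web_discovery::IsPrivateQueryLikely(regex_util, query)) {
    return std::nullopt;  // Likely private: don't investigate or report.
  }
  // Rebuild a bare search URL so session-specific query parameters
  // don't leak into the anonymous double fetch.
  return web_discovery::GeneratePrivateSearchURL(url, query, details);
}
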
2 changes: 2 additions & 0 deletions components/web_discovery/browser/regex_util.h
@@ -16,6 +16,8 @@

namespace web_discovery {

// Lazily creates and caches pre-compiled regexes, mainly used for
// privacy risk assessment of page URLs/contents.
class RegexUtil {
public:
RegexUtil();
10 changes: 10 additions & 0 deletions components/web_discovery/browser/reporter.h
@@ -29,6 +29,15 @@ class SimpleURLLoader;

namespace web_discovery {

// Handles all functions required for reporting generated payloads:
// - zlib compression
// - ECDH key derivation + key exchange
// - AES encryption (to prevent eavesdropping by the server proxy)
// - signing the request using anonymous credentials from the
// `CredentialManager` (to prevent Sybil attacks on the server)
// - performing the request for submission
// Uses `RequestQueue` to persist and schedule submissions. Reports
// will be processed at somewhat random intervals averaging a minute.
class Reporter {
public:
Reporter(PrefService* profile_prefs,
@@ -41,6 +50,7 @@ class Reporter {
Reporter(const Reporter&) = delete;
Reporter& operator=(const Reporter&) = delete;

// Schedules a generated payload for submission.
void ScheduleSend(base::Value::Dict payload);

private:
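
A sketch of handing generated payloads to the reporter; the batch helper is hypothetical:

#include "components/web_discovery/browser/reporter.h"

// Sketch only: each payload is persisted via RequestQueue, then
// compressed, encrypted, signed and sent at a randomized interval.
void SubmitPayloads(web_discovery::Reporter* reporter,
                    std::vector<base::Value::Dict> payloads) {
  for (auto& payload : payloads) {
    reporter->ScheduleSend(std::move(payload));
  }
}
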