Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add Web Discovery reporting & double fetching #24971

Open
wants to merge 4 commits into
base: wdp-native-extraction-payload-gen
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions browser/web_discovery/web_discovery_unittest.cc
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,8 @@
#include "content/public/test/web_contents_tester.h"
#include "testing/gtest/include/gtest/gtest.h"

namespace web_discovery {

class WebDiscoveryCTATest : public testing::Test {
public:
WebDiscoveryCTATest() = default;
Expand Down Expand Up @@ -172,3 +174,5 @@ TEST_F(WebDiscoveryCTATest, ShouldShowInfoBarTest) {
GetWebDiscoveryCTAIDForTesting() = "v2";
EXPECT_TRUE(ShouldShowWebDiscoveryInfoBar());
}

} // namespace web_discovery
14 changes: 14 additions & 0 deletions components/web_discovery/browser/BUILD.gn
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,10 @@ component("browser") {
"credential_manager.cc",
"credential_manager.h",
"credential_signer.h",
"double_fetcher.cc",
"double_fetcher.h",
"ecdh_aes.cc",
"ecdh_aes.h",
"hash_detection.cc",
"hash_detection.h",
"hash_detection_matrix.h",
Expand All @@ -29,10 +33,16 @@ component("browser") {
"privacy_guard.h",
"regex_util.cc",
"regex_util.h",
"reporter.cc",
"reporter.h",
"request_queue.cc",
"request_queue.h",
"rsa.cc",
"rsa.h",
"server_config_loader.cc",
"server_config_loader.h",
"signature_basename.cc",
"signature_basename.h",
"util.cc",
"util.h",
"web_discovery_service.cc",
Expand Down Expand Up @@ -63,11 +73,15 @@ source_set("unit_tests") {
testonly = true
sources = [
"credential_manager_unittest.cc",
"double_fetcher_unittest.cc",
"hash_detection_unittest.cc",
"patterns_unittest.cc",
"payload_generator_unittest.cc",
"privacy_guard_unittest.cc",
"reporter_unittest.cc",
"request_queue_unittest.cc",
"server_config_loader_unittest.cc",
"signature_basename_unittest.cc",
]
deps = [
":browser",
Expand Down
132 changes: 132 additions & 0 deletions components/web_discovery/browser/double_fetcher.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,132 @@
/* Copyright (c) 2024 The Brave Authors. All rights reserved.
* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this file,
* You can obtain one at https://mozilla.org/MPL/2.0/. */

#include "brave/components/web_discovery/browser/double_fetcher.h"

#include <utility>

#include "brave/components/web_discovery/browser/pref_names.h"
#include "brave/components/web_discovery/browser/request_queue.h"
#include "brave/components/web_discovery/browser/util.h"
#include "components/prefs/pref_service.h"
#include "net/http/http_status_code.h"
#include "services/network/public/cpp/header_util.h"
#include "services/network/public/cpp/shared_url_loader_factory.h"
#include "services/network/public/cpp/simple_url_loader.h"
#include "services/network/public/mojom/url_response_head.mojom.h"

namespace web_discovery {

namespace {
// Keys of the dictionary that `ScheduleDoubleFetch` hands to the request
// queue and that `OnFetchTimer` / `OnRequestComplete` read back.
constexpr char kUrlKey[] = "url";
constexpr char kAssociatedDataKey[] = "assoc_data";

// Tuning values passed to the `RequestQueue` constructor, in this order:
// max age, min interval, max interval, max retries.
// NOTE(review): the exact semantics live in `RequestQueue`; presumably
// requests older than the max age are discarded - confirm there.
constexpr base::TimeDelta kRequestMaxAge = base::Hours(1);
// The interval bounds are one minute minus/plus five seconds.
constexpr base::TimeDelta kMinRequestInterval =
base::Minutes(1) - base::Seconds(5);
constexpr base::TimeDelta kMaxRequestInterval =
base::Minutes(1) + base::Seconds(5);
constexpr size_t kMaxRetries = 3;
// Passed to `SimpleURLLoader::DownloadToString` as the response body size
// limit (2 * 1024 * 1024).
constexpr size_t kMaxDoubleFetchResponseSize = 2 * 1024 * 1024;

// Traffic annotation attached to every double-fetch request created in
// `OnFetchTimer`.
constexpr net::NetworkTrafficAnnotationTag kFetchNetworkTrafficAnnotation =
net::DefineNetworkTrafficAnnotation("wdp_doublefetch", R"(
semantics {
sender: "Brave Web Discovery Double Fetch"
description:
"Retrieves a page of interest without cookies for
scraping and reporting via Web Discovery."
trigger:
"Requests are sent minutes after the original
page request is made by the user."
data: "Page data"
destination: WEBSITE
}
policy {
cookies_allowed: NO
setting:
"Users can opt-in or out via brave://settings/search"
})");

} // namespace

// Stores the non-owning `profile_prefs` and `shared_url_loader_factory`
// pointers and sets up the request queue under the
// `kScheduledDoubleFetches` pref with the timing/retry constants above.
// `callback` is invoked from `OnRequestComplete` once the queue hands back
// the data of a finished request.
DoubleFetcher::DoubleFetcher(
    PrefService* profile_prefs,
    network::SharedURLLoaderFactory* shared_url_loader_factory,
    FetchedCallback callback)
    : profile_prefs_(profile_prefs),
      shared_url_loader_factory_(shared_url_loader_factory),
      // `request_queue_` is a member of this object, so the unretained
      // `this` bound here goes away together with the queue that holds it.
      request_queue_(profile_prefs,
                     kScheduledDoubleFetches,
                     kRequestMaxAge,
                     kMinRequestInterval,
                     kMaxRequestInterval,
                     kMaxRetries,
                     base::BindRepeating(&DoubleFetcher::OnFetchTimer,
                                         base::Unretained(this))),
      // `callback` is a by-value sink parameter; move it instead of copying.
      callback_(std::move(callback)) {}

// Defaulted: no teardown beyond the members' own destructors. Defined in the
// .cc file, where `network::SimpleURLLoader` is a complete type for the
// `std::unique_ptr` member.
DoubleFetcher::~DoubleFetcher() = default;

// Packs the URL spec and the caller's associated data into one dictionary
// (keys `kUrlKey` / `kAssociatedDataKey`) and hands it to the request queue.
void DoubleFetcher::ScheduleDoubleFetch(const GURL& url,
                                        base::Value associated_data) {
  auto queued_request =
      base::Value::Dict()
          .Set(kUrlKey, url.spec())
          .Set(kAssociatedDataKey, std::move(associated_data));
  request_queue_.ScheduleRequest(std::move(queued_request));
}

// Invoked by the request queue with the stored data of the request that is
// due. Starts the network fetch for the stored URL; completion is reported
// through `OnRequestComplete`.
void DoubleFetcher::OnFetchTimer(const base::Value& request_data) {
  const auto* fetch_dict = request_data.GetIfDict();
  const auto* url_str = fetch_dict ? fetch_dict->FindString(kUrlKey) : nullptr;
  const GURL url = url_str ? GURL(*url_str) : GURL();
  if (!url.is_valid()) {
    // The entry is malformed (not a dict, no URL, or a URL that does not
    // parse). A retry can never succeed, so report it as complete to drop it
    // from the queue rather than sending it to the network stack.
    request_queue_.NotifyRequestComplete(true);
    return;
  }

  auto resource_request = CreateResourceRequest(url);
  url_loader_ = network::SimpleURLLoader::Create(
      std::move(resource_request), kFetchNetworkTrafficAnnotation);
  // `url_loader_` is owned by this object, so the unretained `this` cannot
  // outlive the loader that holds the callback.
  url_loader_->DownloadToString(
      shared_url_loader_factory_.get(),
      base::BindOnce(&DoubleFetcher::OnRequestComplete, base::Unretained(this),
                     url),
      kMaxDoubleFetchResponseSize);
}

// Completion handler for the fetch started in `OnFetchTimer`.
//
// `url` is the URL that was fetched; `response_body` is the downloaded body,
// if any. The outcome is reported to the request queue, and if the queue
// returns the request's stored data, `callback_` is run with the URL, the
// associated data and the body (`std::nullopt` unless the fetch succeeded).
void DoubleFetcher::OnRequestComplete(
    GURL url,
    std::optional<std::string> response_body) {
  bool result = false;
  const auto* response_info = url_loader_->ResponseInfo();
  // `headers` can be null even when response info exists, so check both
  // before reading the status code.
  if (response_info && response_info->headers) {
    const auto response_code = response_info->headers->response_code();
    if (network::IsSuccessfulStatus(response_code)) {
      result = true;
    } else {
      // Only retry failures due to server error.
      // Mark 4xx responses as 'successful' so that they are not retried.
      result = response_code >= net::HttpStatusCode::HTTP_BAD_REQUEST &&
               response_code < net::HttpStatusCode::HTTP_INTERNAL_SERVER_ERROR;
      response_body = std::nullopt;
    }
  } else {
    // No usable HTTP status: treat as a retryable failure and never forward
    // a body for it.
    response_body = std::nullopt;
  }

  auto request_data = request_queue_.NotifyRequestComplete(result);
  if (!request_data) {
    return;
  }

  // `GetIfDict` instead of `GetDict`: a malformed stored value is skipped
  // rather than tripping a CHECK.
  const auto* request_dict = request_data->GetIfDict();
  const auto* assoc_data =
      request_dict ? request_dict->Find(kAssociatedDataKey) : nullptr;
  if (assoc_data) {
    callback_.Run(url, *assoc_data, response_body);
  }
}

} // namespace web_discovery
66 changes: 66 additions & 0 deletions components/web_discovery/browser/double_fetcher.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
/* Copyright (c) 2024 The Brave Authors. All rights reserved.
* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this file,
* You can obtain one at https://mozilla.org/MPL/2.0/. */

#ifndef BRAVE_COMPONENTS_WEB_DISCOVERY_BROWSER_DOUBLE_FETCHER_H_
#define BRAVE_COMPONENTS_WEB_DISCOVERY_BROWSER_DOUBLE_FETCHER_H_

#include <memory>
#include <optional>
#include <string>

#include "base/memory/raw_ptr.h"
#include "base/values.h"
#include "brave/components/web_discovery/browser/request_queue.h"
#include "url/gurl.h"

class PrefService;

namespace network {
class SharedURLLoaderFactory;
class SimpleURLLoader;
} // namespace network

namespace web_discovery {

// Makes anonymous requests to relevant page URLs, without involvement of the
// user's session. In the case of search engine result pages, the result of the
// double fetch will scraped for search engine results for a future submission.
// Uses `RequestQueue` to persist and schedule double fetches. Requests
// will be sent on somewhat random intervals averaging to a minute.
class DoubleFetcher {
public:
using FetchedCallback =
base::RepeatingCallback<void(const GURL& url,
const base::Value& associated_data,
std::optional<std::string> response_body)>;
DoubleFetcher(PrefService* profile_prefs,
network::SharedURLLoaderFactory* shared_url_loader_factory,
FetchedCallback callback);
~DoubleFetcher();

DoubleFetcher(const DoubleFetcher&) = delete;
DoubleFetcher& operator=(const DoubleFetcher&) = delete;

// Queues a double fetch for a given URL. The associated data will be stored
// beside the queue request, and will be passed to the `FetchedCallback`
// upon completion.
void ScheduleDoubleFetch(const GURL& url, base::Value associated_data);

private:
void OnFetchTimer(const base::Value& request_data);
void OnRequestComplete(GURL url, std::optional<std::string> response_body);

raw_ptr<PrefService> profile_prefs_;
raw_ptr<network::SharedURLLoaderFactory> shared_url_loader_factory_;
DJAndries marked this conversation as resolved.
Show resolved Hide resolved
std::unique_ptr<network::SimpleURLLoader> url_loader_;

RequestQueue request_queue_;

FetchedCallback callback_;
};

} // namespace web_discovery

#endif // BRAVE_COMPONENTS_WEB_DISCOVERY_BROWSER_DOUBLE_FETCHER_H_
Loading
Loading