Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Nanobenchmarks #4143

Merged
merged 22 commits into from
Dec 29, 2024
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
67 changes: 38 additions & 29 deletions documentation/bibliography.bib
Original file line number Diff line number Diff line change
Expand Up @@ -945,7 +945,7 @@ @article{Kutta1901
url = {https://archive.org/details/zeitschriftfrma12runggoog/page/435},
date = {1901-11},
journaltitle = {Zeitschrift für Mathematik und Physik},
pages = {435453},
pages = {435--453},
title = {Beitrag zur näherungsweisen Integration totaler Differentialgleichungen},
volume = {46},
}
Expand Down Expand Up @@ -1170,8 +1170,8 @@ @article{Newhall1989
volume = {45},
}

@article{NguyễnStehlé2009,
author = {Nguyễn, Phong Q. and Stehlé, Damien},
@article{NguyễnStehlé2009,
author = {Nguyễn, Phong Q. and Stehlé, Damien},
url = {https://doi.org/10.1137/070705702},
date = {2009},
doi = {10.1137/070705702},
Expand Down Expand Up @@ -1728,7 +1728,7 @@ @book{Meeus1998
}

@book{MullerBrisebarreDeDinechinJeannerodLefevreMelquiondRevolStehleTorres2010,
author = {Muller, Jean-Michel and Brisebarre, Nicolas and De Dinechin, Florent and Jeannerod, Claude-Pierre and Lefèvre, Vincent and Melquiond, Guillaume and Revol, Nathalie and Stehlé, Damien and Torres, Serge},
author = {Muller, Jean-Michel and Brisebarre, Nicolas and De Dinechin, Florent and Jeannerod, Claude-Pierre and Lefèvre, Vincent and Melquiond, Guillaume and Revol, Nathalie and Stehlé, Damien and Torres, Serge},
publisher = {Birkhäuser},
date = {2010},
isbn = {978-0-8176-4704-9},
Expand All @@ -1752,7 +1752,7 @@ @book{NistHMF2010
}

@book{NocedalWright2006,
author = {Nocedal, Jorge, and Wright, Stephen J.},
author = {Nocedal, Jorge and Wright, Stephen J.},
publisher = {Springer},
date = {2006},
isbn = {978-0387-30303-1},
Expand Down Expand Up @@ -1869,7 +1869,7 @@ @inbook{ZatloukalJohnsonLadner2002
booktitle = {Near Neighbor Searches, and Methodology: Fifth and Sixth DIMACS Implementation Challenges},
date = {2002},
isbn = {0821828924},
pages = {69-86},
pages = {69--86},
title = {Nearest Neighbor Search for Data Compression},
}

Expand Down Expand Up @@ -2074,32 +2074,32 @@ @inproceedings{SofroniouSpaletta2002
venue = {Amsterdam, The Netherlands},
}

@inproceedings{StehléZimmermann2005,
author = {Stehlé, Damien and Zimmermann, Paul},
editor = {Montuschi, Paolo and Schwarz, Eric},
publisher = {IEEE Computer Society},
booktitle = {17th IEEE Symposium on Computer Arithmetic (ARITH'05)},
date = {2005-06},
doi = {10.1109/ARITH.2005.24},
eventdate = {2005-06-27/2005-06-29},
isbn = {0-7695-2366-8},
pages = {257--264},
title = {Gal's accurate tables method revisited},
venue = {Cape Cod, MA, USA},
@inproceedings{StehléZimmermann2005,
author = {Stehlé, Damien and Zimmermann, Paul},
editor = {Montuschi, Paolo and Schwarz, Eric},
publisher = {IEEE Computer Society},
booktitle = {17th IEEE Symposium on Computer Arithmetic (ARITH'05)},
date = {2005-06},
doi = {10.1109/ARITH.2005.24},
eventdate = {2005-06-27/2005-06-29},
isbn = {0-7695-2366-8},
pages = {257--264},
title = {Gal's accurate tables method revisited},
venue = {Cape Cod, MA, USA},
}

@inproceedings{WuZhang1991,
author = {Wu, Xiaolin and Zhang, Kaizhong},
editor = {Storer, James A. and Reif, John H.},
publisher = {IEEE Computer Society},
booktitle = {1991 Data Compression Conference},
date = {1991-04},
doi = {10.1109/DCC.1991.213341},
eventdate = {1991-04-08/1991-04-11},
isbn = {0-8186-9202-2},
pages = {392-401},
title = {A better tree-structured vector quantizer},
venue = {Snowbird, UT, USA},
author = {Wu, Xiaolin and Zhang, Kaizhong},
editor = {Storer, James A. and Reif, John H.},
publisher = {IEEE Computer Society},
booktitle = {1991 Data Compression Conference},
date = {1991-04},
doi = {10.1109/DCC.1991.213341},
eventdate = {1991-04-08/1991-04-11},
isbn = {0-8186-9202-2},
pages = {392--401},
title = {A better tree-structured vector quantizer},
venue = {Snowbird, UT, USA},
}

@mvbook{Fontenelle1758,
Expand Down Expand Up @@ -2224,6 +2224,15 @@ @report{LongCappellariVelezFuchs
type = {techreport},
}

@report{Paoloni2010,
author = {Paoloni, Gabriele},
institution = {Intel Corporation},
date = {2010-09},
number = {324264-001},
title = {How to Benchmark Code Execution Times on Intel® IA-32 and IA-64 Instruction Set Architectures},
type = {White Paper},
}

@report{RiesBettadpurEanesKangKoMcCulloughNagelPiePooleRichterSaveTapley2016,
author = {Ries, J. and Bettadpur, S. and Eanes, R. and Kang, Z. and Ko, U. and McCullough, C. and Nagel, P. and Pie, N. and Poole, S. and Save, H. and Tapley, B.},
institution = {Center for Space Research at the University of Texas at Austin},
Expand Down
Binary file modified documentation/bibliography.pdf
Binary file not shown.
44 changes: 30 additions & 14 deletions nanobenchmarks/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,7 @@ struct LatencyDistributionTable {
double min;
std::vector<double> quantiles;

static std::string const& heading() {
static std::string const& Heading() {
static std::string const& result = [] {
std::stringstream& out = *new std::stringstream();
std::print(out, "{:>8}", "min");
Expand Down Expand Up @@ -118,40 +118,56 @@ struct LatencyDistributionTable {
}
};

LatencyDistributionTable operator*(double a, LatencyDistributionTable x) {
LatencyDistributionTable operator*(double const a,
LatencyDistributionTable const& x) {
LatencyDistributionTable result{a * x.min};
for (double const quantile : x.quantiles) {
result.quantiles.push_back(a * quantile);
}
return result;
}

LatencyDistributionTable operator+(LatencyDistributionTable x, double b) {
LatencyDistributionTable operator+(LatencyDistributionTable const& x,
double const b) {
LatencyDistributionTable result{x.min + b};
for (double const quantile : x.quantiles) {
result.quantiles.push_back(quantile + b);
}
return result;
}

// We disable inlining on this function so that the overhead is independent of
// the callsite, and so that we actually call the benchmarked function via a
// function pointer, instead of inlining it.
__declspec(noinline) LatencyDistributionTable
eggrobin marked this conversation as resolved.
Show resolved Hide resolved
Benchmark(BenchmarkedFunction f, Logger* logger) {
Benchmark(BenchmarkedFunction const f, Logger* logger) {
std::size_t const sample_count = absl::GetFlag(FLAGS_samples);
std::size_t const loop_iterations = absl::GetFlag(FLAGS_loop_iterations);
static std::vector<double>& samples = *new std::vector<double>(
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Let's use a pointer, not a reference, to avoid depending on lifetime extension.

Copy link
Member Author

@eggrobin eggrobin Dec 29, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Lifetime is obvious: *new, we leak.
Let’s not litter the code with (*) below.

sample_count, std::numeric_limits<double>::quiet_NaN());
int registers[4]{};
int leaf = 0;
for (int j = 0; j < sample_count; ++j) {
double x = absl::GetFlag(FLAGS_input);
double const input = absl::GetFlag(FLAGS_input);
double x = input;
// The CPUID barriers prevent out-of-order execution; see [Pao10].
__cpuid(registers, leaf);
auto const tsc = __rdtsc();
auto const tsc_start = __rdtsc();
for (int i = 0; i < loop_iterations; ++i) {
x = f(x);
x += 5 - x;
x += input - x;
}
unsigned int tsc_aux;
// The use of RDTSCP rather than RDTSC here follows [Pao10]. See the Intel®
// 64 and IA-32 Architectures Software Developer’s Manual:
// The RDTSCP instruction is not a serializing instruction, but it does
// wait until all previous instructions have executed and all previous loads
// are globally visible. But it does not wait for previous stores to be
// globally visible, and subsequent instructions may begin execution before
// the read operation is performed.
auto const tsc_stop = __rdtscp(&tsc_aux);
__cpuid(registers, leaf);
eggrobin marked this conversation as resolved.
Show resolved Hide resolved
double const δtsc = __rdtsc() - tsc;
double const δtsc = tsc_stop - tsc_start;
samples[j] = δtsc / loop_iterations;
}
if (logger != nullptr) {
Expand All @@ -168,10 +184,10 @@ __declspec(noinline) LatencyDistributionTable
return result;
}

std::size_t FormattedWidth(std::string s) {
std::size_t FormattedWidth(std::string const& s) {
// Two columns per code unit is wide enough, since field width is at most 2
// per extended grapheme cluster.
std::size_t wide = 2 * s.size();
std::size_t const wide = 2 * s.size();
// There is no vformatted_size, so we actually format.
std::size_t const formatted_size =
std::vformat("{:" + std::to_string(wide) + "}",
Expand All @@ -184,7 +200,7 @@ std::size_t FormattedWidth(std::string s) {

void Main() {
std::regex const name_matcher(absl::GetFlag(FLAGS_benchmark_filter));
auto controller = PerformanceSettingsController::Make();
auto controller = PerformanceSettingsController::New();
std::unique_ptr<Logger> logger;
std::string const& filename = absl::GetFlag(FLAGS_log_to_mathematica);
if (!filename.empty()) {
Expand All @@ -202,13 +218,13 @@ void Main() {
ReferenceCycleCounts().contains(f);
}) |
std::views::keys | std::views::transform(&FormattedWidth);
std::size_t name_width = *std::ranges::max_element(name_widths);
std::size_t const name_width = *std::ranges::max_element(name_widths);
std::map<BenchmarkedFunction, LatencyDistributionTable>
reference_measurements;
std::vprint_unicode(
"{:<" + std::to_string(name_width + 2) + "}{:8}{}\n",
std::make_format_args(
"RAW TSC:", "", LatencyDistributionTable::heading()));
"RAW TSC:", "", LatencyDistributionTable::Heading()));
for (auto const& [function, _] : ReferenceCycleCounts()) {
auto const result = Benchmark(function, logger.get());
reference_measurements.emplace(function, result);
Expand Down Expand Up @@ -237,7 +253,7 @@ void Main() {
std::vprint_unicode(
"{:<" + std::to_string(name_width + 2) + "}{:>8}{}\n",
std::make_format_args(
"Cycles:", "expected", LatencyDistributionTable::heading()));
"Cycles:", "expected", LatencyDistributionTable::Heading()));
for (auto const& [name, f] :
FunctionRegistry::functions_by_name()) {
if (!std::regex_match(name, name_matcher) &&
Expand Down
Loading
Loading