Commit 0ad9c1c

polish

Quafadas committed Sep 27, 2024
1 parent 15bdfd3 commit 0ad9c1c
Showing 19 changed files with 167 additions and 175 deletions.
19 changes: 14 additions & 5 deletions benchmark/src/vectorAddition.scala
```diff
@@ -28,6 +28,7 @@ package vecxt.benchmark
 import org.openjdk.jmh.annotations.*
 import org.openjdk.jmh.infra.Blackhole
 import vecxt.Matrix.*
+import jdk.incubator.vector.DoubleVector
 import vecxt.*
 import scala.compiletime.uninitialized
 
@@ -61,26 +62,34 @@ class AddScalarBenchmark extends BLASBenchmark:
   end setup
 
   extension (vec: Array[Double])
-    inline def scalarPlus(d: Double): Unit =
-      var i = 0
+    inline def scalarPlusVec(d: Double): Unit =
+      val species = DoubleVector.SPECIES_PREFERRED
+      var i: Int = 0
+      val l = species.length()
+
+      while i < species.loopBound(vec.length) do
+        DoubleVector.fromArray(species, vec, i).add(DoubleVector.broadcast(species, d)).intoArray(vec, i)
+        i += l
+      end while
+
       while i < vec.length do
         vec(i) += d
         i += 1
       end while
-    end scalarPlus
+    end scalarPlusVec
 
   end extension
 
   @Benchmark
   def vecxt_add(bh: Blackhole) =
-    vec.scalarPlus(4.5)
+    vec.scalarPlusVec(4.5)
     bh.consume(vec);
   end vecxt_add
 
   @Benchmark
   def vecxt_add_vec(bh: Blackhole) =
-    vec2 +:+= 4.5
+    vec2.scalarPlusVec(4.5)
     bh.consume(vec2);
   end vecxt_add_vec
 end AddScalarBenchmark
```
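The new `scalarPlusVec` follows the standard Panama Vector API shape: a SIMD main loop up to `loopBound`, then a scalar tail. For readers new to the incubator API, here is that pattern annotated as a standalone sketch (assumes the JVM flag `--add-modules=jdk.incubator.vector`; this is not the benchmark code itself):

```scala
import jdk.incubator.vector.DoubleVector

// Sketch of the loop shape used in the benchmark above.
def addScalarInPlace(vec: Array[Double], d: Double): Unit =
  val species = DoubleVector.SPECIES_PREFERRED // widest SIMD species this CPU supports
  val step    = species.length()               // lanes processed per iteration
  var i       = 0
  // Main loop: whole vectors at a time, up to the largest multiple of `step`.
  while i < species.loopBound(vec.length) do
    DoubleVector
      .fromArray(species, vec, i)              // load `step` doubles
      .add(DoubleVector.broadcast(species, d)) // lane-wise add
      .intoArray(vec, i)                       // store back in place
    i += step
  end while
  // Scalar tail: the remainder that doesn't fill a full vector.
  while i < vec.length do
    vec(i) += d
    i += 1
  end while
```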
4 changes: 3 additions & 1 deletion build.sc
```diff
@@ -1,6 +1,6 @@
 
 import $ivy.`com.github.lolgab::mill-crossplatform::0.2.4`
-import $ivy.`io.github.quafadas::millSite::0.0.30`
+import $ivy.`io.github.quafadas::millSite::0.0.31-DIRTY1d54e88b`
 import $ivy.`de.tototec::de.tobiasroeser.mill.vcs.version::0.4.0`
 import $ivy.`com.lihaoyi::mill-contrib-jmh:`
 
@@ -187,6 +187,8 @@ object site extends SiteModule {
 
   override val jsSiteModule = jsSite
 
+  override def pathToImportMap = Some(PathRef(T.workspace / "importmap.json"))
+
   override def forkArgs: T[Seq[String]] = super.forkArgs() ++ vecIncubatorFlag
 
   def scalaVersion = vecxt.jvm.scalaVersion
```
7 changes: 7 additions & 0 deletions importmap.json
```diff
@@ -0,0 +1,7 @@
+{
+  "imports": {
+    "##vega-embed": "https://cdn.jsdelivr.net/npm/vega-embed@6/+esm?bundle-deps=true",
+    "##vega-view": "https://cdn.jsdelivr.net/npm/vega-view@5/+esm?bundle-deps=true",
+    "@stdlib/blas/base": "https://cdn.jsdelivr.net/npm/@stdlib/blas@0.2.0/base/+esm"
+  }
+}
```
2 changes: 1 addition & 1 deletion justfile
```diff
@@ -7,7 +7,7 @@ benchmark:
   mill benchmark.runJmh -jvmArgs --add-modules=jdk.incubator.vector -rf json
 
 benchmarkOnly:
-  mill benchmark.runJmh -jvmArgs --add-modules=jdk.incubator.vector -rf json vecxt.benchmark.LogicalBenchmark
+  mill benchmark.runJmh -jvmArgs --add-modules=jdk.incubator.vector -rf json vecxt.benchmark.AddScalarBenchmark
 
 setJvm:
   eval "$(cs java --jvm 21 --env)"
```
2 changes: 1 addition & 1 deletion site/docs/_blog/_posts/2024-08-01-Motivation.md
````diff
@@ -17,7 +17,7 @@
 
 # JVM
 
-[[vecxt]] is cross platform, this example runs on the JVM, see [Cross Platform](js.mdoc.md) for the same example running in scalaJS.
+[[vecxt]] is cross platform; this example runs on the JVM. See [Cross Platform](/js.mdoc.md) for the same example running in scalaJS.
 
 ```scala mdoc
````
25 changes: 17 additions & 8 deletions site/docs/_blog/_posts/2024-09-19-Benchmarking.md
```diff
@@ -17,26 +17,35 @@ From first principles, I thought about 3 problems;
 
 ## Trigger
 
-I concluded I wanted a manual trigger. Each push to main is too much - a lot of redundant data and duplicated work on each commit. In future perhaps switch to benchmark on release. As the suite is being built out, I think manual makes more sense.
+I concluded I wanted a manual trigger. Each push to main is too much: benchmarking is by definition compute intensive, and doing it continuously is rather wasteful outside of something truly mission critical. In future, perhaps switch to benchmarking on release. Currently, manual makes more sense.
 
-## Where shoudl the data be stored
+## Where should the data be stored
 
 JMH itself doesn't give you more than the results. We need extra metadata. ChatGPT wrote me a shell script which appends the date, commit and branch that the results were generated from into the results html. This will be our metadata.
 
 This file is then pushed to an orphaned [branch](https://github.com/Quafadas/vecxt/tree/benchmark). The benchmarking process ends here - it only stores the data, it's then left to the consumer to figure out what to do with it.
```
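The metadata step itself is trivial. A minimal Scala sketch of the same idea (the real script is shell, generated by ChatGPT, so the file name and exact format here are assumptions):

```scala
import java.nio.file.{Files, Paths, StandardOpenOption}
import java.time.LocalDate
import scala.sys.process.*

// Sketch only: append date / commit / branch metadata to a results file.
@main def stampResults(): Unit =
  val commit = "git rev-parse --short HEAD".!!.trim
  val branch = "git rev-parse --abbrev-ref HEAD".!!.trim
  val meta   = s"""{"date": "${LocalDate.now}", "commit": "$commit", "branch": "$branch"}"""
  Files.write(
    Paths.get("results.html"), // hypothetical results file
    s"<!-- benchmark-meta: $meta -->\n".getBytes,
    StandardOpenOption.APPEND
  )
```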

```diff
 ## Consumption
 
-The goal is that we will then hook into our orphan branch in the github action and aggregate the results into a single benchmarking.json file. This file will be added to the static assets of the site.
+During the GitHub Pages build step (i.e. in GHA), we switch into our orphan branch and aggregate the results into a single benchmarking.json file, post-processing the data from the step above to flatten all results into a single array. This file is added to the static assets of the site.
 
-Our benchmark plots may then reference and polish that data it is, that they want to plot.
+It may be found [here](../../benchmarks/benchmark_history.json).
```
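The aggregation amounts to reading each per-run JMH results file and flattening everything into one array. A sketch of that step, assuming lihaoyi's os-lib and ujson (the real work happens in the GitHub Action, and the paths here are assumptions):

```scala
// Sketch only: flatten every JMH results file into one history file.
@main def aggregate(): Unit =
  val merged = os
    .list(os.pwd / "benchmarks")              // hypothetical checkout of the orphan branch
    .filter(_.ext == "json")
    .flatMap(p => ujson.read(os.read(p)).arr) // each JMH file is a JSON array of results
  os.write.over(
    os.pwd / "benchmark_history.json",
    ujson.write(ujson.Arr(merged*), indent = 2)
  )
```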

````diff
 # Conclusion
 
-That there is a bug in scaladoc.
-
-https://github.com/scala/scala3/issues/21637
+Now we're in a position to plot some benchmarks.
+
+```javascript
+<div id="vis" style="width: 50vw;height: 10vh"></div>
+
+<script type="module">
+  import vegaEmbed from "https://cdn.jsdelivr.net/npm/vega-embed@6/+esm?bundle-deps=true";
+  var spec = "../../plots/addScalar.vg.json";
+  vegaEmbed('#vis', spec).then(function(result) {
+    // Access the Vega view instance (https://vega.github.io/vega/docs/api/view/) as result.view
+  }).catch(console.error);
+</script>
+```
+
+This JavaScript embeds a vega plot into the page. Now we need only provide a plot file per visualisation. Ideally, we would abstract over the visualisations we need - my pet project dedav4s would be great for this, but scalaJS in mdoc is currently a challenge.
+
+Otherwise, this is a good way to keep an eye on performance. During the benchmarking process, I caught several small errors. In hot loops, those small errors were adding up to significant frictional losses. It is good to have a way to measure this systematically.
````
17 changes: 10 additions & 7 deletions site/docs/_docs/Setup.md
````diff
@@ -16,21 +16,24 @@
 
 # JS
 
-You'll need this or better, available somewhere in your bundle / build / browser / whatever.
+The best way is without a bundler, using ES modules. Use this import map to load the module out of jsDelivr.
 
 ```json
 {
-  "dependencies": {
-    "@stdlib/blas": "^0.1.1"
+  "imports": {
+    "@stdlib/blas/base": "https://cdn.jsdelivr.net/npm/@stdlib/blas@0.2.0/base/+esm"
   }
 }
 ```
-If you aren't using a bundler, consider loading vs an ESModule and remapping the imports at link time through your build tool.
 
-Worst case;
+If you gotta bundle, you gotta bundle:
 
-```html
-<script type="text/javascript" src="https://cdn.jsdelivr.net/gh/stdlib-js/blas@umd/browser.js"></script>
+```json
+{
+  "dependencies": {
+    "@stdlib/blas": "^0.2.0"
+  }
+}
 ```
 
 # Native
````
4 changes: 2 additions & 2 deletions site/docs/_docs/benchmarks/addScalar.md
```diff
@@ -15,6 +15,6 @@ Here is the comparison of the standard `while` loop with the vectorised version.
 
 # Conclusion
 
-The case here is nuanced. The looped version is significantly faster, for small array sizes.
+To my surprise, the vectorised version is slower than the standard `while` loop in nearly every case, across both environments I have to test in.
 
-It could be, that the vectorised version is somehow inefficiently initiated. Whilst the case is more nuanced, I'm targeting larger data sizes, and so the vectorised version is left in.
+It could be that the JDK is simply very good at optimising this case. Feedback welcomed - but as the standard `while` loop outperforms the vectorised version, it is the `while` loop that is left in.
```
2 changes: 1 addition & 1 deletion site/docs/_docs/benchmarks/increments.md
```diff
@@ -17,4 +17,4 @@ Here is the comparison of the standard `while` loop with the vectorised version.
 
 The case here is nuanced. The looped version is significantly faster, for small array sizes.
 
-It could be, that the vectorised version is somehow inefficiently initiated. Whilst the case is more nuanced, I'm targeting larger data sizes, and so the vectorised version is left in.
+It could be that the vectorised version is somehow inefficiently initiated. Whilst the case is nuanced, I'm targeting larger data sizes, and so the vectorised version is left in - at those sizes it holds a circa 20% throughput advantage.
```
48 changes: 21 additions & 27 deletions site/docs/_docs/benchmarks/matmul.md
````diff
@@ -23,34 +23,28 @@
 
 ```scala sc:nocompile
-extension (vec: Array[Double])
+val a : Matrix[Double] = ??? // some matrix
+val b : Matrix[Double] = ??? // some matrix
 
-@Benchmark
-def vecxt_mmult(bh: Blackhole)=
-  val cclone = matA @@ matB
-  bh.consume(cclone);
-end vecxt_mmult
+//vecxt
+val multiplied = a @@ b
 
-@Benchmark
-def java_dgemm(bh: Blackhole) =
-  val cclone = Array.fill[Double](m*n)(0)
+//blas
+val multiplied2 =
   blas.dgemm(
-    transa,
-    transb,
-    m,
-    n,
-    k,
-    alpha,
-    a,
-    if transa.equals("N") then m else k,
-    b,
-    if transb.equals("N") then k else n,
-    beta,
-    cclone,
-    m
+    "N",
+    "N",
+    a.rows,
+    b.cols,
+    a.cols,
+    1.0,
+    a.raw,
+    a.rows,
+    b.raw,
+    b.rows,
+    1.0,
+    newArr,
+    a.rows
   );
-  bh.consume(cclone);
-end java_dgemm
 
 ```
+It is true that some amount of flexibility is given up in terms of multiplying transposes etc. If that turns out to be painful, further extension methods could be considered.
````
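If transposed multiplies do become painful, such an extension method might look like the following. This is a hypothetical sketch (the name `mmulT` and the output buffer are inventions, not library API), reusing the `dgemm` leading-dimension logic from the old benchmark code:

```scala
// Hypothetical sketch: a transpose-aware multiply delegating to dgemm.
extension (a: Matrix[Double])
  def mmulT(b: Matrix[Double], transa: String = "N", transb: String = "N"): Array[Double] =
    val m = if transa == "N" then a.rows else a.cols // rows of op(A)
    val k = if transa == "N" then a.cols else a.rows // cols of op(A) == rows of op(B)
    val n = if transb == "N" then b.cols else b.rows // cols of op(B)
    val out = new Array[Double](m * n)               // zero-filled result buffer
    blas.dgemm(
      transa, transb, m, n, k,
      1.0,
      a.raw, if transa == "N" then m else k,         // leading dimension of A
      b.raw, if transb == "N" then k else n,         // leading dimension of B
      1.0, out, m
    )
    out
```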
```diff
@@ -1,8 +1,12 @@
 ---
 title: General Performance
 ---
 
+# Performance
+
+The core aim of this library is to provide platform native, friction free performance. I'm confident that I can't do better than this - at least outside of an absurd amount of effort.
+
-In general cross platform performance is a hard problem. We sidestep it as far as possible by simply providing compiletime `@inline`-shim-to-BLAS implementations.
+In general, cross platform performance is a hard problem. We sidestep it where possible by simply providing compiletime `@inline`-shim-to-BLAS implementations.
 
 
 ||JVM|JS|Native|Cross|
```
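Concretely, the shim pattern amounts to an inline extension that forwards straight to the platform BLAS. An illustrative sketch, not the library's actual code (`blas` stands in for the platform binding):

```scala
// Illustrative only: a compiletime-inlined forward to BLAS on the JVM.
extension (vec: Array[Double])
  inline def dot(that: Array[Double]): Double =
    blas.ddot(vec.length, vec, 1, that, 1) // inlines to a single BLAS call

  inline def scaled(alpha: Double): Array[Double] =
    val out = vec.clone()
    blas.dscal(out.length, alpha, out, 1)  // scale in place on the copy
    out
```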
```diff
@@ -15,18 +19,12 @@ Consider browsing the [[vecxt]] api, and particularly the extensions object.
 
 ### JVM
 
-On the JVM, firstly, we have the JVM. It does a really good job of optimising code.
-
-As we target performance, this library also targets the "project Panama", "Vector", or "SIMD" apis, which aim to provide hardware accelerated performance.
+On the JVM, firstly, we have the JVM itself, which does a very good job of optimising code. Further, this library targets the "project Panama", "Vector", or "SIMD" APIs, which aim to provide hardware accelerated performance. Each function has been benchmarked against a `while` loop.
 
-The BLAS shim uses that API to hit C levels of performance for BLAS operations.
-
-Where I can benchmark a performance improvement vs a "while" loop, I've begun adding my own vectorised implementation for hot operations.
+The BLAS shim uses that API to hit C levels of performance for BLAS operations, and is written by an MS researcher. It's good.
 
 ### JS
 
-On Node, this shim ships with it's own C BLAS implementation.
-
-In browser / elsewhere, it's done in loop-unrolled native arrays.
+On Node, this shim ships with its own C BLAS implementation. In the browser / elsewhere, the BLAS implementation is done in loop-unrolled native arrays.
 
 TODO: Investigate webassembly?
```
47 changes: 0 additions & 47 deletions site/docs/_docs/demos/notes.mdoc.md

This file was deleted.

51 changes: 0 additions & 51 deletions site/docs/_docs/inBrowser.mdoc.md

This file was deleted.
