From e8b5f08392ede52bc992e4ed91bd31a8161cf2c8 Mon Sep 17 00:00:00 2001
From: Ilya Yaroshenko <ilyayaroshenko@gmail.com>
Date: Mon, 17 Oct 2016 16:54:44 +0200
Subject: [PATCH] switch to native complex numbers (#367)

* rework glas

* add const

* use native complex numbers

* remove complex and update sum

* fix style
---
 benchmarks/glas/gemm_bench.d        |  29 +++--
 benchmarks/glas/gemm_report.d       |  28 +++--
 benchmarks/glas/symm_bench.d        |  29 +++--
 source/mir/glas/common.d            |  52 +--------
 source/mir/glas/internal/blocking.d |  32 +++---
 source/mir/glas/internal/config.d   |   1 -
 source/mir/glas/internal/context.d  |  66 ++++++++----
 source/mir/glas/internal/copy.d     |  24 ++---
 source/mir/glas/internal/gemm.d     |  15 ++-
 source/mir/glas/internal/symm.d     |  10 +-
 source/mir/glas/internal/test.d     |   1 -
 source/mir/glas/internal/trmm.d     |   1 -
 source/mir/glas/internal/trsm.d     |   1 -
 source/mir/glas/l1.d                |  85 +++++++--------
 source/mir/glas/l2.d                |  12 +--
 source/mir/glas/l3.d                |  65 +++++-------
 source/mir/internal/utility.d       |  11 +-
 source/mir/sum.d                    | 157 ++++++++++++++++------------
 18 files changed, 287 insertions(+), 332 deletions(-)

diff --git a/benchmarks/glas/gemm_bench.d b/benchmarks/glas/gemm_bench.d
index 67d1f41c..50702483 100644
--- a/benchmarks/glas/gemm_bench.d
+++ b/benchmarks/glas/gemm_bench.d
@@ -2,8 +2,9 @@
 /+ dub.json:
 {
 	"name": "gemm_bench",
-	"dependencies": {"mir": {"path": "../.."}, "cblas": "~>0.1.0"},
+	"dependencies": {"mir": {"path": "../.."}, "cblas": "~>1.0.0"},
 	"dflags-ldc": ["-mcpu=native"],
+	"libs": ["blas"],
 	"lflags": ["-L./"]
 }
 +/
@@ -11,24 +12,23 @@ import std.math;
 import std.traits;
 import std.datetime;
 import std.conv;
-import std.complex;
 import std.algorithm.comparison;
 import std.stdio;
 import std.exception;
 import std.getopt;
 import mir.ndslice;
 import mir.glas;
+import mir.internal.utility : isComplex;
 
 alias C = float;
 //alias C = double;
-//alias C = Complex!float;
-//alias C = Complex!double;
+//alias C = cfloat;
+//alias C = cdouble;
 alias A = C;
 alias B = C;
 
 void main(string[] args)
 {
-	auto glas = new GlasContext;
 	size_t m = 1000;
 	size_t n = size_t.max;
 	size_t k = size_t.max;
@@ -60,10 +60,10 @@ void main(string[] args)
 
 	d[] = c[];
 
-	static if(is(C : Complex!F, F))
+	static if(isComplex!C)
 	{
-		C alpha = C(3, 7);
-		C beta = C(2, 5);
+		C alpha = 3 + 7i;
+		C beta = 2 + 5i;
 	}
 	else
 	{
@@ -71,17 +71,15 @@ void main(string[] args)
 		C beta = 2;
 	}
 
-
 	auto nsecsBLAS = double.max;
 
-
 	foreach(_; 0..count) {
 		StopWatch sw;
 		sw.start;
-		static if(!(is(C == real) || is(C : Complex!real) || is(C : long)))
+		static if(!(is(C == real) || is(C == creal) || is(C : long)))
 		{
 			static import cblas;
-			static if(is(C : Complex!E, E))
+			static if(isComplex!C)
 			cblas.gemm(
 				cblas.Order.RowMajor,
 				cblas.Transpose.NoTrans,
@@ -128,7 +126,7 @@ void main(string[] args)
 	{
 		StopWatch sw;
 		sw.start;
-		glas.gemm(alpha, a, b, beta, c);
+		gemm(alpha, a, b, beta, c);
 		sw.stop;
 		auto newns = sw.peek.to!Duration.total!"nsecs".to!double;
 		//writefln("_GLAS (single thread)               : %5s GFLOPS", (m * n * k * 2) / newns);
@@ -147,10 +145,9 @@ void fillRNG(T)(Slice!(2, T*) sl)
 	import std.random;
 	foreach(ref e; sl.byElement)
 	{
-		static if(is(T : Complex!F, F))
+		static if(isComplex!T)
 		{
-			e.re = cast(F) uniform(-100, 100);
-			e.im = cast(F) uniform(-100, 100);
+			e = uniform(-100, 100) + uniform(-100, 100) * 1i;
 		}
 		else
 		{
diff --git a/benchmarks/glas/gemm_report.d b/benchmarks/glas/gemm_report.d
index ad27b6f6..004735a7 100644
--- a/benchmarks/glas/gemm_report.d
+++ b/benchmarks/glas/gemm_report.d
@@ -2,10 +2,10 @@
 /+ dub.json:
 {
 	"name": "gemm_report",
-	"dependencies": {"mir": {"path": "../.."}, "cblas": "~>0.2.0"},
+	"dependencies": {"mir": {"path": "../.."}, "cblas": "~>1.0.0"},
 	"lflags": ["-L./"],
 	"libs": ["blas"],
-	"dflags-ldc": ["-mcpu=native", "-singleobj"],
+	"dflags-ldc": ["-mcpu=native"],
 }
 +/
 	//"lflags": ["-L/opt/intel/mkl/lib"],
@@ -19,18 +19,18 @@ import std.math;
 import std.traits;
 import std.datetime;
 import std.conv;
-import std.complex;
 import std.algorithm.comparison;
 import std.stdio;
 import std.exception;
 import std.getopt;
 import mir.ndslice;
 import mir.glas;
+import mir.internal.utility : isComplex;
 
 alias C = float;
 //alias C = double;
-//alias C = Complex!float;
-//alias C = Complex!double;
+//alias C = cfloat;
+//alias C = cdouble;
 alias A = C;
 alias B = C;
 
@@ -41,7 +41,6 @@ size_t[] reportValues = [
 
 void main(string[] args)
 {
-	auto glas = new GlasContext;
 	size_t count = 6;
 	auto helpInformation = 
 	getopt(args,
@@ -69,10 +68,10 @@ void main(string[] args)
 
 		d[] = c[];
 
-		static if(is(C : Complex!F, F))
+		static if(isComplex!C)
 		{
-			C alpha = C(3, 7);
-			C beta = C(2, 5);
+			C alpha = 3 + 7i;
+			C beta = 2 + 5i;
 		}
 		else
 		{
@@ -85,10 +84,10 @@ void main(string[] args)
 		foreach(_; 0..count) {
 			StopWatch sw;
 			sw.start;
-			static if(!(is(C == real) || is(C : Complex!real) || is(C : long)))
+			static if(!(is(C == real) || is(C == creal) || is(C : long)))
 			{
 				static import cblas;
-				static if(is(C : Complex!E, E))
+				static if(isComplex!C)
 				cblas.gemm(
 					cblas.Order.RowMajor,
 					cblas.Transpose.NoTrans,
@@ -137,7 +136,7 @@ void main(string[] args)
 		{
 			StopWatch sw;
 			sw.start;
-			glas.gemm(alpha, a, b, beta, c);
+			gemm(alpha, a, b, beta, c);
 			sw.stop;
 			auto newns = sw.peek.to!Duration.total!"nsecs".to!double;
 			//writefln("_GLAS (single thread)               : %5s GFLOPS", (m * n * k * 2) / newns);
@@ -154,10 +153,9 @@ void fillRNG(T)(Slice!(2, T*) sl)
 	import std.random;
 	foreach(ref e; sl.byElement)
 	{
-		static if(is(T : Complex!F, F))
+		static if(isComplex!T)
 		{
-			e.re = cast(F) uniform(-100, 100);
-			e.im = cast(F) uniform(-100, 100);
+			e = uniform(-100, 100) + uniform(-100, 100) * 1i;
 		}
 		else
 		{
diff --git a/benchmarks/glas/symm_bench.d b/benchmarks/glas/symm_bench.d
index 33479d84..8a04938e 100644
--- a/benchmarks/glas/symm_bench.d
+++ b/benchmarks/glas/symm_bench.d
@@ -2,8 +2,9 @@
 /+ dub.json:
 {
 	"name": "symm_bench",
-	"dependencies": {"mir": {"path": "../.."}, "cblas": "~>0.1.0"},
+	"dependencies": {"mir": {"path": "../.."}, "cblas": "~>1.0.0"},
 	"dflags-ldc": ["-mcpu=native"],
+	"libs": ["blas"],
 	"lflags": ["-L./"]
 }
 +/
@@ -11,24 +12,23 @@ import std.math;
 import std.traits;
 import std.datetime;
 import std.conv;
-import std.complex;
 import std.algorithm.comparison;
 import std.stdio;
 import std.exception;
 import std.getopt;
 import mir.ndslice;
 import mir.glas;
+import mir.internal.utility : isComplex;
 
 alias C = float;
 //alias C = double;
-//alias C = Complex!float;
-//alias C = Complex!double;
+//alias C = cfloat;
+//alias C = cdouble;
 alias A = C;
 alias B = C;
 
 void main(string[] args)
 {
-	auto glas = new GlasContext;
 	size_t m = 1000;
 	size_t n = size_t.max;
 	size_t k = size_t.max;
@@ -59,10 +59,10 @@ void main(string[] args)
 
 	d[] = c[];
 
-	static if(is(C : Complex!F, F))
+	static if(isComplex!C)
 	{
-		C alpha = C(3, 7);
-		C beta = C(2, 5);
+		C alpha = 3 + 7i;
+		C beta = 2 + 5i;
 	}
 	else
 	{
@@ -70,17 +70,15 @@ void main(string[] args)
 		C beta = 2;
 	}
 
-
 	auto nsecsBLAS = double.max;
 
-
 	foreach(_; 0..count) {
 		StopWatch sw;
 		sw.start;
-		static if(!(is(C == real) || is(C : Complex!real) || is(C : long)))
+		static if(!(is(C == real) || is(C == creal) || is(C : long)))
 		{
 			static import cblas;
-			static if(is(C : Complex!E, E))
+			static if(isComplex!C)
 			cblas.symm(
 				cblas.Order.RowMajor,
 				cblas.Side.Left,
@@ -125,7 +123,7 @@ void main(string[] args)
 	{
 		StopWatch sw;
 		sw.start;
-		glas.symm(Side.left, Uplo.lower, alpha, a, b, beta, c);
+		symm(Side.left, Uplo.lower, alpha, a, b, beta, c);
 		sw.stop;
 		auto newns = sw.peek.to!Duration.total!"nsecs".to!double;
 		//writefln("_GLAS (single thread)               : %5s GFLOPS", (m * n * m * 2) / newns);
@@ -148,10 +146,9 @@ void fillRNG(T)(Slice!(2, T*) sl)
 	import std.random;
 	foreach(ref e; sl.byElement)
 	{
-		static if(is(T : Complex!F, F))
+		static if(isComplex!T)
 		{
-			e.re = cast(F) uniform(-100, 100);
-			e.im = cast(F) uniform(-100, 100);
+			e = uniform(-100, 100) + uniform(-100, 100) * 1i;
 		}
 		else
 		{
diff --git a/source/mir/glas/common.d b/source/mir/glas/common.d
index 485831c3..d646d483 100644
--- a/source/mir/glas/common.d
+++ b/source/mir/glas/common.d
@@ -9,53 +9,6 @@ Authors: Ilya Yaroshenko
 +/
 module mir.glas.common;
 
-/++
-GLAS Context
-
-Note: `GlasContext` is single thread for now.
-+/
-struct GlasContext
-{
-    import mir.internal.memory;
-    import mir.glas.internal.context;
-    static import cpuid.unified;
-    import core.sync.mutex;
-
-    private
-    {
-        void[] _memory;
-    }
-
-nothrow @nogc:
-
-    ~this()
-    {
-        release;
-    }
-
-    /// Returns: reused unaligned memory chunk
-    nothrow @nogc void[] memory(size_t size)
-    {
-        if (_memory.length < size)
-        {
-            auto f = _memory.length << 1;
-            if (f > size)
-                size = f;
-            if (_memory !is null)
-                deallocate(_memory);
-            _memory = alignedAllocate(size, 4096);
-        }
-        return _memory[0 .. size];
-    }
-
-    /// Releases memory.
-    void release()
-    {
-        if (_memory !is null)
-            deallocate(_memory);
-    }
-}
-
 /++
 Uplo specifies whether a matrix is an upper or lower triangular matrix.
 +/
@@ -153,10 +106,7 @@ package mixin template prefix3()
     enum PB = CB ? 2 : 1;
     enum PC = CC ? 2 : 1;
 
-    static if (is(C : Complex!F, F))
-        alias T = F;
-    else
-        alias T = C;
+    alias T = realType!C;
     static assert(!isComplex!T);
 }
 
diff --git a/source/mir/glas/internal/blocking.d b/source/mir/glas/internal/blocking.d
index 6c39bc9a..00edad47 100644
--- a/source/mir/glas/internal/blocking.d
+++ b/source/mir/glas/internal/blocking.d
@@ -1,11 +1,13 @@
 module mir.glas.internal.blocking;
 
-import std.traits;
+import core.sync.mutex;
 import std.meta;
-import std.complex : Complex;
-import mir.internal.utility;
-import mir.glas.internal.config;
+import std.traits;
 import mir.glas.common;
+import mir.glas.internal.config;
+import mir.glas.internal.context;
+import mir.internal.utility;
+static import cpuid.unified;
 
 import ldc.attributes : fastmath;
 @fastmath:
@@ -20,20 +22,20 @@ struct BlockInfo(T)
     T* b;
 }
 
-BlockInfo!T blocking(size_t PA, size_t PB, size_t PC, T)(GlasContext* ctx, size_t m, size_t n, size_t k)
+BlockInfo!T blocking(size_t PA, size_t PB, size_t PC, T)(size_t m, size_t n, size_t k)
 {
     import mir.glas.internal.context;
     mixin RegisterConfig!(PC, PA, PB, T);
     BlockInfo!T ret = void;
-    sizediff_t l2 = c2.size << 9; // half cache
+    sizediff_t l2 = c2 >> 1; // half cache
     ret.kc = (l2 - m * T[PC][main_nr].sizeof) / (m * T[PA].sizeof + T[PB][main_nr].sizeof);
     ret.mc = m;
     enum minKc = 320 / PC;
-    auto a = 2 * (T[PC][main_nr][main_mr].sizeof + main_nr * c1.line) + 512;
-    if (ret.kc < minKc || ret.kc * (T[PA][main_mr].sizeof + T[PB][main_nr].sizeof) + a  > c1.size << 10)
+    auto a = 2 * (T[PC][main_nr][main_mr].sizeof + main_nr * line) + 512;
+    if (ret.kc < minKc || ret.kc * (T[PA][main_mr].sizeof + T[PB][main_nr].sizeof) + a  > c1)
     {
-        ret.kc = ((c1.size << 10) - a) / (T[PA][main_mr].sizeof + T[PB][main_nr].sizeof);
-        assert(c1.size << 10 > main_mr);
+        ret.kc = (c1 - a) / (T[PA][main_mr].sizeof + T[PB][main_nr].sizeof);
+        assert(c1 > main_mr);
         assert(ret.kc > main_mr);
         ret.kc.normalizeChunkSize!main_mr(k);
         assert(ret.kc > 0);
@@ -48,20 +50,20 @@ BlockInfo!T blocking(size_t PA, size_t PB, size_t PC, T)(GlasContext* ctx, size_
     auto a_length = ret.kc * ret.mc * T[PA].sizeof;
     auto b_length = ret.kc * T[PB].sizeof * (ret.mc == m && false ? main_nr : n);
     auto buffLength = a_length + b_length;
-    auto _mem = ctx.memory(a_length + b_length + prefetchShift);
+    auto _mem = memory(a_length + b_length + prefetchShift);
     ret.a = cast(T*) _mem.ptr;
     ret.b = cast(T*) (_mem.ptr + a_length);
     return ret;
 }
 
-BlockInfo!T blocking_triangular(size_t PA, size_t PB, T)(GlasContext* ctx, size_t m, size_t n)
+BlockInfo!T blocking_triangular(size_t PA, size_t PB, T)(size_t m, size_t n)
 {
     import mir.glas.internal.context;
     mixin RegisterConfig!(PB, PA, PB, T);
     BlockInfo!T ret = void;
 
-    sizediff_t l2 = c2.size << 10; // half matrix
-    //ret.kc = ((c1.size << 10) - 2 * (T[PB][main_nr][main_mr].sizeof + main_nr * c1.line) - 512) / (T[PA][main_nr].sizeof + T[PB][main_mr].sizeof);
+    sizediff_t l2 = c2; // half matrix
+    //ret.kc = (c1 - 2 * (T[PB][main_nr][main_mr].sizeof + main_nr * line) - 512) / (T[PA][main_nr].sizeof + T[PB][main_mr].sizeof);
 
         import std.stdio;
     if (l2 >= (m * ((m + main_nr) * PA + PB * main_mr * 2)) * T.sizeof)
@@ -83,7 +85,7 @@ BlockInfo!T blocking_triangular(size_t PA, size_t PB, T)(GlasContext* ctx, size_
     auto a_length = ret.kc * ret.mc * T[PA].sizeof;
     auto b_length = ret.kc * T[PB].sizeof * (ret.mc == m && false ? main_mr : n);
     auto buffLength = a_length + b_length;
-    auto _mem = ctx.memory(a_length + b_length + prefetchShift);
+    auto _mem = memory(a_length + b_length + prefetchShift);
     ret.b = cast(T*) _mem.ptr;
     ret.a = cast(T*) (_mem.ptr + b_length);
 
diff --git a/source/mir/glas/internal/config.d b/source/mir/glas/internal/config.d
index f3f4d649..e944745d 100644
--- a/source/mir/glas/internal/config.d
+++ b/source/mir/glas/internal/config.d
@@ -2,7 +2,6 @@ module mir.glas.internal.config;
 
 import std.traits;
 import std.meta;
-import std.complex: Complex;
 import mir.internal.utility;
 
 template RegisterConfig(size_t PS, size_t PB, size_t PR, T)
diff --git a/source/mir/glas/internal/context.d b/source/mir/glas/internal/context.d
index 7f461afe..06e756c8 100644
--- a/source/mir/glas/internal/context.d
+++ b/source/mir/glas/internal/context.d
@@ -2,10 +2,38 @@ module mir.glas.internal.context;
 
 import std.range.primitives;
 import cpuid.unified;
+import mir.internal.memory;
 
-__gshared immutable Cache c1;
-__gshared immutable Cache c2;
-__gshared immutable Tlb tlb;
+__gshared uint c1;
+__gshared uint c2;
+__gshared uint line;
+__gshared void[] _memory;
+
+
+import ldc.attributes : fastmath;
+@fastmath:
+
+/// Returns: reused unaligned memory chunk
+nothrow @nogc void[] memory(size_t size)
+{
+    if (_memory.length < size)
+    {
+        auto f = _memory.length << 1;
+        if (f > size)
+            size = f;
+        if (_memory !is null)
+            deallocate(_memory);
+        _memory = alignedAllocate(size, 4096);
+    }
+    return _memory[0 .. size];
+}
+
+/// Releases memory.
+nothrow @nogc void release()
+{
+    if (_memory !is null)
+        deallocate(_memory);
+}
 
 nothrow @nogc
 shared static this()
@@ -19,43 +47,45 @@ shared static this()
 
     enum msg =  "MIR: failed to get CPUID information";
 
-    while (!uc.empty && uc.back.size > (1024 * 32)) // > 32 MB is CPU memory
+    while (!uc.empty && uc.back.size > (1024 * 64)) // > 64 MB is CPU memory
     {
         uc.popBack;
     }
 
     if (dc.length)
     {
-        c1 = dc.front;
+        c1 = dc.front.size;
+        line = dc.front.line;
         dc.popFront;
     }
     else
     if (uc.length)
     {
-        c1 = uc.front;
+        c1 = uc.front.size;
+        line = uc.front.line;
         uc.popFront;
     }
-    else assert(0, msg);
+    else
+    {
+        c1 = 16;
+    }
 
     if (uc.length)
     {
-        c2 = uc.back;
+        c2 = uc.back.size;
     }
     else
     if (dc.length)
     {
-        c2 = dc.back;
-    }
-    else assert(0, msg);
-
-    if (uTlb.length)
-    {
-        tlb = uTlb.back;
+        c2 = dc.back.size;
     }
     else
-    if (dTlb.length)
     {
-        tlb = dTlb.back;
+        c1 = 256;
     }
-    else assert(0, msg);
+
+    c1 <<= 10;
+    c2 <<= 10;
+    if (line == 0)
+        line = 64;
 }
diff --git a/source/mir/glas/internal/copy.d b/source/mir/glas/internal/copy.d
index abb86214..d5e66461 100644
--- a/source/mir/glas/internal/copy.d
+++ b/source/mir/glas/internal/copy.d
@@ -2,7 +2,6 @@ module mir.glas.internal.copy;
 
 import std.traits;
 import std.meta;
-import std.complex : Complex;
 import mir.ndslice.slice : Slice;
 import mir.internal.utility;
 import mir.glas.internal.config;
@@ -12,7 +11,7 @@ import ldc.attributes : fastmath;
 @fastmath:
 
 pragma(inline, true)
-T* pack_b_nano(size_t n, size_t P, bool conj = false, F, T)(size_t length, sizediff_t stride, sizediff_t elemStride, F* from, T* to)
+T* pack_b_nano(size_t n, size_t P, bool conj = false, F, T)(size_t length, sizediff_t stride, sizediff_t elemStride, const(F)* from, T* to)
 {
     if (elemStride == 1)
         return pack_b_dense_nano!(n, P, conj)(length, stride, from, to);
@@ -21,7 +20,7 @@ T* pack_b_nano(size_t n, size_t P, bool conj = false, F, T)(size_t length, sized
 }
 
 pragma(inline, false)
-T* pack_b_sym_nano(size_t n, size_t P, bool conj = false, F, T)(size_t length, Slice!(2, F*) sl, size_t j, size_t i, T* to)
+T* pack_b_sym_nano(size_t n, size_t P, bool conj = false, F, T)(size_t length, Slice!(2, const(F)*) sl, size_t j, size_t i, T* to)
 {
     {
         sizediff_t diff = i - j;
@@ -90,7 +89,7 @@ T* pack_b_sym_nano(size_t n, size_t P, bool conj = false, F, T)(size_t length, S
 }
 
 pragma(inline, false)
-T* pack_b_strided_nano(size_t n, size_t P, bool conj = false, F, T)(size_t length, sizediff_t stride, sizediff_t elemStride, F* from, T* to)
+T* pack_b_strided_nano(size_t n, size_t P, bool conj = false, F, T)(size_t length, sizediff_t stride, sizediff_t elemStride, const(F)* from, T* to)
 {
     enum s = n * P;
     do
@@ -121,7 +120,7 @@ T* pack_b_strided_nano(size_t n, size_t P, bool conj = false, F, T)(size_t lengt
 }
 
 //pragma(inline, false)
-T* pack_b_dense_nano(size_t n, size_t P, bool conj = false, F, T)(size_t length, sizediff_t stride, F* from, T* to)
+T* pack_b_dense_nano(size_t n, size_t P, bool conj = false, F, T)(size_t length, sizediff_t stride, const(F)* from, T* to)
 {
     enum s = n * P;
     do
@@ -164,7 +163,7 @@ T* pack_b_dense_nano(size_t n, size_t P, bool conj = false, F, T)(size_t length,
 }
 
 pragma(inline, true)
-T* pack_a_nano(size_t n, size_t P, bool conj = false, F, T)(size_t length, sizediff_t stride, sizediff_t elemStride, F* from, T* to)
+T* pack_a_nano(size_t n, size_t P, bool conj = false, F, T)(size_t length, sizediff_t stride, sizediff_t elemStride, const(F)* from, T* to)
 {
     if (elemStride == 1)
         return pack_a_dense_nano!(n, P, conj)(length, stride, from, to);
@@ -173,7 +172,7 @@ T* pack_a_nano(size_t n, size_t P, bool conj = false, F, T)(size_t length, sized
 }
 
 //pragma(inline, false)
-T* pack_a_strided_nano(size_t n, size_t P, bool conj = false, F, T)(size_t length, sizediff_t stride, sizediff_t elemStride, F* from, T* to)
+T* pack_a_strided_nano(size_t n, size_t P, bool conj = false, F, T)(size_t length, sizediff_t stride, sizediff_t elemStride, const(F)* from, T* to)
 {
     static if (P == 1)
     {
@@ -201,7 +200,7 @@ T* pack_a_strided_nano(size_t n, size_t P, bool conj = false, F, T)(size_t lengt
 }
 
 //pragma(inline, false)
-T* pack_a_dense_nano(size_t mr, size_t P, bool conj = false, T, F)(size_t length, sizediff_t stride, F* from, T* to)
+T* pack_a_dense_nano(size_t mr, size_t P, bool conj = false, T, F)(size_t length, sizediff_t stride, const(F)* from, T* to)
 {
     do
     {
@@ -254,7 +253,7 @@ T* pack_a_dense_nano(size_t mr, size_t P, bool conj = false, T, F)(size_t length
 }
 
 pragma(inline, false)
-void pack_a(size_t PA, size_t PB, size_t PC, bool conj = false, T, C)(Slice!(2, C*) sl, T* a)
+void pack_a(size_t PA, size_t PB, size_t PC, bool conj = false, T, C)(Slice!(2, const(C)*) sl, T* a)
 {
     import mir.ndslice.iteration: transposed;
     mixin RegisterConfig!(PC, PA, PB, T);
@@ -281,7 +280,7 @@ void pack_a(size_t PA, size_t PB, size_t PC, bool conj = false, T, C)(Slice!(2,
 }
 
 pragma(inline, false)
-void pack_a_sym(size_t PA, size_t PB, size_t PC, bool conj = false, T, F)(Slice!(2, F*) sl, size_t i, size_t t, size_t mc, size_t kc, T* to)
+void pack_a_sym(size_t PA, size_t PB, size_t PC, bool conj = false, T, F)(Slice!(2, const(F)*) sl, size_t i, size_t t, size_t mc, size_t kc, T* to)
 {
     import mir.ndslice.iteration: transposed, reversed;
     mixin RegisterConfig!(PC, PA, PB, T);
@@ -375,7 +374,7 @@ void pack_a_sym(size_t PA, size_t PB, size_t PC, bool conj = false, T, F)(Slice!
 
 
 //pragma(inline, false)
-void pack_b_triangular(Uplo uplo, bool inverseDiagonal, size_t PA, size_t PB, size_t PC, T, C)(Slice!(2, C*) sl, T* b)
+void pack_b_triangular(Uplo uplo, bool inverseDiagonal, size_t PA, size_t PB, size_t PC, T, C)(Slice!(2, const(C)*) sl, T* b)
 {
     assert(sl.length!0 == sl.length!1);
     import mir.ndslice.iteration: transposed;
@@ -424,7 +423,7 @@ void pack_b_triangular(Uplo uplo, bool inverseDiagonal, size_t PA, size_t PB, si
     while (!nri && sl.length >= nr);
 }
 
-void load_simd(size_t mr, size_t P, T)(T* to, T[P]* from)
+void load_simd(size_t mr, size_t P, T)(T* to, const(T[P])* from)
 {
     static if (mr > 1 && !is(T == real))
     {
@@ -455,7 +454,6 @@ void load_simd(size_t mr, size_t P, T)(T* to, T[P]* from)
 //void pack_b(size_t PA, size_t PB, size_t PC, T, C)(Slice!(2, C*) sl, T* b)
 //{
 //    import mir.ndslice.iteration: transposed;
-//    import std.complex: Complex;
 //    mixin RegisterConfig!(PC, PA, PB, T);
 //    if (sl.stride!0 == 1)
 //    {
diff --git a/source/mir/glas/internal/gemm.d b/source/mir/glas/internal/gemm.d
index ada6ad0a..9cd8f894 100644
--- a/source/mir/glas/internal/gemm.d
+++ b/source/mir/glas/internal/gemm.d
@@ -1,7 +1,6 @@
 module mir.glas.internal.gemm;
 
 import std.traits;
-import std.complex;
 import std.meta;
 
 public import mir.glas.common;
@@ -18,10 +17,9 @@ pragma(inline, false)
 //nothrow @nogc
 void gemm_impl(A, B, C)
 (
-    GlasContext* ctx,
     C alpha,
-        Slice!(2, A*) asl,
-        Slice!(2, B*) bsl,
+        Slice!(2, const(A)*) asl,
+        Slice!(2, const(B)*) bsl,
     C beta,
         Slice!(2, C*) csl,
     Conjugated conja,
@@ -60,7 +58,7 @@ void gemm_impl(A, B, C)
         }
         else
         {
-            ctx.gemm!(B, A, C)(alpha, bsl.transposed, asl.transposed, beta, csl.transposed, conjb, conja);
+            gemm_impl!(B, A, C)(alpha, bsl.transposed, asl.transposed, beta, csl.transposed, conjb, conja);
             return;
         }
     }
@@ -110,7 +108,7 @@ void gemm_impl(A, B, C)
             beta_kernels[nri] = &gemv_reg!(BetaType.beta, PA, PB, PC, nr, T);
         kernels = beta_kernels.ptr;
     }
-    auto bl = blocking!(PA, PB, PC, T)(ctx, asl.length!0, bsl.length!1, asl.length!1);
+    auto bl = blocking!(PA, PB, PC, T)(asl.length!0, bsl.length!1, asl.length!1);
     with(bl)
     {
         sizediff_t incb;
@@ -426,9 +424,8 @@ void scale_nano(size_t M, size_t P, size_t N, V, F)(ref const F[P] alpha, ref V[
 pragma(inline, true)
 ref auto castByRef(C)(return ref C val)
 {
-    import std.complex: Complex;
-    static if (is(C : Complex!T, T))
-        alias R = T[2];
+    static if (isComplex!C)
+        alias R = typeof(val.re)[2];
     else
         alias R = C[1];
     return *cast(R*) &val;
diff --git a/source/mir/glas/internal/symm.d b/source/mir/glas/internal/symm.d
index 7dd574c4..2a15cb10 100644
--- a/source/mir/glas/internal/symm.d
+++ b/source/mir/glas/internal/symm.d
@@ -1,7 +1,6 @@
 module mir.glas.internal.symm;
 
 import std.traits;
-import std.complex;
 import std.meta;
 
 public import mir.glas.common;
@@ -17,10 +16,9 @@ pragma(inline, false)
 //nothrow @nogc
 void symm_impl(A, B, C)
 (
-    GlasContext* ctx,
     C alpha,
-        Slice!(2, A*) asl,
-        Slice!(2, B*) bsl,
+        Slice!(2, const(A)*) asl,
+        Slice!(2, const(B)*) bsl,
     C beta,
         Slice!(2, C*) csl,
     Conjugated conja,
@@ -114,7 +112,7 @@ void symm_impl(A, B, C)
             }
         }
 
-        auto bl = blocking!(PA, PB, PC, T)(ctx, asl.length!0, bsl.length!1, asl.length!0);
+        auto bl = blocking!(PA, PB, PC, T)(asl.length!0, bsl.length!1, asl.length!0);
         size_t j;
         sizediff_t incb;
         if (bl.mc  < asl.length!0)
@@ -195,7 +193,7 @@ void symm_impl(A, B, C)
         bsl = bsl.transposed;
         csl = csl.transposed;
         assert(csl.stride!0 == 1);
-        auto bl = blocking!(PB, PA, PC, T)(ctx, bsl.length!0, asl.length!0, bsl.length!1);
+        auto bl = blocking!(PB, PA, PC, T)(bsl.length!0, asl.length!0, bsl.length!1);
         sizediff_t incb;
         if (bl.mc < bsl.length!0)
             incb = bl.kc;
diff --git a/source/mir/glas/internal/test.d b/source/mir/glas/internal/test.d
index 407a7c10..999048b3 100644
--- a/source/mir/glas/internal/test.d
+++ b/source/mir/glas/internal/test.d
@@ -2,7 +2,6 @@ module mir.glas.internal.test;
 
 import std.traits;
 import std.meta;
-import std.complex;
 import std.algorithm: min, max;
 import mir.ndslice;
 import mir.glas;
diff --git a/source/mir/glas/internal/trmm.d b/source/mir/glas/internal/trmm.d
index 858ea1b8..88e8c136 100644
--- a/source/mir/glas/internal/trmm.d
+++ b/source/mir/glas/internal/trmm.d
@@ -1,7 +1,6 @@
 module mir.glas.trmm;
 
 import std.traits;
-import std.complex;
 import std.meta;
 
 version(none):
diff --git a/source/mir/glas/internal/trsm.d b/source/mir/glas/internal/trsm.d
index c9f017f3..04ec4061 100644
--- a/source/mir/glas/internal/trsm.d
+++ b/source/mir/glas/internal/trsm.d
@@ -1,7 +1,6 @@
 module mir.glas.trsm;
 
 import std.traits;
-import std.complex;
 import std.meta;
 
 version(none):
diff --git a/source/mir/glas/l1.d b/source/mir/glas/l1.d
index 0a81723d..e9644b30 100644
--- a/source/mir/glas/l1.d
+++ b/source/mir/glas/l1.d
@@ -80,7 +80,6 @@ unittest
 
 import std.traits;
 import std.meta;
-import std.complex : Complex, conj;
 import std.typecons: Flag, Yes, No;
 
 import mir.internal.math;
@@ -100,11 +99,15 @@ template _rot(alias c, alias s)
         auto y = yr;
         auto t1 = c * x + s * y;
         static if (isComplex!(typeof(c)))
-            auto t2 = conj(c) * y;
+        {
+            auto t2 = (c.re - c.im * 1fi) * y;
+        }
         else
             auto t2 = c * y;
         static if (isComplex!(typeof(s)))
-            t2 -= conj(s) * x;
+        {
+            t2 -= (s.re - s.im * 1fi) * x;
+        }
         else
             t2 -= s * x;
         xr = t1;
@@ -130,7 +133,7 @@ A _fmuladdc(A, B, C)(A a, in B b, in C c)
 {
     static if (isComplex!B)
     {
-        return a + conj(b) * c;
+        return a + (b.re - b.im * 1fi) * c;
     }
     else
         return a + b * c;
@@ -263,16 +266,14 @@ unittest
 unittest
 {
     import mir.ndslice.slice: slice;
-    import std.complex;
-    alias cd = Complex!double;
-
-    auto a = cd(3, 4);
-    auto x = slice!cd(2);
-    auto y = slice!cd(2);
-    x[] = [cd(0, 1), cd(2, 3)];
-    y[] = [cd(4, 5), cd(6, 7)];
+
+    auto a = 3 + 4i;
+    auto x = slice!cdouble(2);
+    auto y = slice!cdouble(2);
+    x[] = [0 + 1i, 2 + 3i];
+    y[] = [4 + 5i, 6 + 7i];
     axpy(a, x, y);
-    assert(y == [a * cd(0, 1) + cd(4, 5), a * cd(2, 3) + cd(6, 7)]);
+    assert(y == [a * (0 + 1i) + (4 + 5i), a * (2 + 3i) + (6 + 7i)]);
 }
 
 /++
@@ -295,7 +296,7 @@ F dot(F, size_t N, R1, R2)(Slice!(N, R1) x, Slice!(N, R2) y)
     {
         assert(x.shape == y.shape, "constraints: x and y must have equal shapes");
         pragma(inline, false);
-        return ndReduce!(_fmuladd, Yes.vectorized)(F(0), x, y);
+        return ndReduce!(_fmuladd, Yes.vectorized)(cast(F)(0), x, y);
     }
 }
 
@@ -342,14 +343,12 @@ unittest
 unittest
 {
     import mir.ndslice.slice: slice;
-    import std.complex;
-    alias cd = Complex!double;
 
-    auto x = slice!cd(2);
-    auto y = slice!cd(2);
-    x[] = [cd(0, 1), cd(2, 3)];
-    y[] = [cd(4, 5), cd(6, 7)];
-    assert(dot(x, y) == cd(0, 1) * cd(4, 5) + cd(2, 3) * cd(6, 7));
+    auto x = slice!cdouble(2);
+    auto y = slice!cdouble(2);
+    x[] = [0 + 1i, 2 + 3i];
+    y[] = [4 + 5i, 6 + 7i];
+    assert(dot(x, y) == (0 + 1i) * (4 + 5i) + (2 + 3i) * (6 + 7i));
 }
 
 /++
@@ -373,7 +372,7 @@ F dotc(F, size_t N, R1, R2)(Slice!(N, R1) x, Slice!(N, R2) y)
     {
         assert(x.shape == y.shape, "constraints: x and y must have equal shapes");
         pragma(inline, false);
-        return ndReduce!(_fmuladdc, Yes.vectorized)(F(0), x, y);
+        return ndReduce!(_fmuladdc, Yes.vectorized)(cast(F)(0), x, y);
     }
 }
 
@@ -387,14 +386,12 @@ auto dotc(size_t N, R1, R2)(Slice!(N, R1) x, Slice!(N, R2) y)
 unittest
 {
     import mir.ndslice.slice: slice;
-    import std.complex;
-    alias cd = Complex!double;
 
-    auto x = slice!cd(2);
-    auto y = slice!cd(2);
-    x[] = [cd(0, 1), cd(2, 3)];
-    y[] = [cd(4, 5), cd(6, 7)];
-    assert(dotc(x, y) == cd(0, -1) * cd(4, 5) + cd(2, -3) * cd(6, 7));
+    auto x = slice!cdouble(2);
+    auto y = slice!cdouble(2);
+    x[] = [0 + 1i, 2 + 3i];
+    y[] = [4 + 5i, 6 + 7i];
+    assert(dotc(x, y) == (0 + -1i) * (4 + 5i) + (2 + -3i) * (6 + 7i));
 }
 
 /++
@@ -435,11 +432,9 @@ unittest
 {
     import mir.ndslice.slice: slice;
     import std.math: sqrt, approxEqual;
-    import std.complex;
-    alias cd = Complex!double;
 
-    auto x = slice!cd(2);
-    x[] = [cd(0, 1), cd(2, 3)];
+    auto x = slice!cdouble(2);
+    x[] = [0 + 1i, 2 + 3i];
 
     assert(nrm2(x).approxEqual(sqrt(1.0 + 4 + 9)));
 }
@@ -484,11 +479,9 @@ unittest
 unittest
 {
     import mir.ndslice.slice: slice;
-    import std.complex;
-    alias cd = Complex!double;
 
-    auto x = slice!cd(2);
-    x[] = [cd(0, 1), cd(2, 3)];
+    auto x = slice!cdouble(2);
+    x[] = [0 + 1i, 2 + 3i];
 
     assert(sqnrm2(x) == 1.0 + 4 + 9);
 }
@@ -535,11 +528,9 @@ unittest
 unittest
 {
     import mir.ndslice.slice: slice;
-    import std.complex;
-    alias cd = Complex!double;
 
-    auto x = slice!cd(2);
-    x[] = [cd(0, -1), cd(-2, 3)];
+    auto x = slice!cdouble(2);
+    x[] = [0 - 1i, -2 + 3i];
 
     assert(asum(x) == 1 + 2 + 3);
 }
@@ -619,12 +610,10 @@ unittest
 unittest
 {
     import mir.ndslice.slice: slice;
-    import std.complex;
-    alias cd = Complex!double;
 
-    auto x = slice!cd(4);
+    auto x = slice!cdouble(4);
     //        0          1          2         3
-    x[] = [cd(0, -1), cd(-2, 3), cd(2, 3), cd(2, 2)];
+    x[] = [0 + -1i, -2 + 3i, 2 + 3i, 2 + 2i];
 
     assert(iamax(x) == 1);
     // -1 for empty vectors
@@ -670,11 +659,9 @@ unittest
 unittest
 {
     import mir.ndslice.slice: slice;
-    import std.complex;
-    alias cd = Complex!double;
 
-    auto x = slice!cd(4);
-    x[] = [cd(0, -1), cd(-7, 3), cd(2, 3), cd(2, 2)];
+    auto x = slice!cdouble(4);
+    x[] = [0 + -1i, -7 + 3i, 2 + 3i, 2 + 2i];
 
     assert(amax(x) == 10);
     // 0 for empty vectors
diff --git a/source/mir/glas/l2.d b/source/mir/glas/l2.d
index fc8d3bb5..0d5efb95 100644
--- a/source/mir/glas/l2.d
+++ b/source/mir/glas/l2.d
@@ -30,7 +30,6 @@ public import mir.glas.common;
 
 import std.traits;
 import std.meta;
-import std.complex : Complex, conj;
 import std.typecons: Flag, Yes, No;
 
 import mir.internal.math;
@@ -50,7 +49,6 @@ Performs general matrix-vector multiplication.
 Pseudo_code: `y := alpha A × x + beta y`.
 
 Params:
-    ctx = GLAS context. Should not be accessed by other threads.
     alpha = scalar
     asl = `m ⨉ n` matrix
     xsl = `n ⨉ 1` vector
@@ -67,13 +65,12 @@ BLAS: SGEMV, DGEMV, (CGEMV, ZGEMV are not implemented for now)
 
 See_also: $(SUBREF common, Conjugated).
 +/
-nothrow @nogc
+nothrow @nogc @system
 void gemv(A, B, C)
 (
-    GlasContext* ctx,
     C alpha,
-        Slice!(2, A*) asl,
-        Slice!(1, B*) xsl,
+        Slice!(2, const(A)*) asl,
+        Slice!(1, const(B)*) xsl,
     C beta,
         Slice!(1, C*) ysl,
     Conjugated conja = Conjugated.no,
@@ -141,8 +138,7 @@ unittest
 
     auto c = slice!double(3);
 
-    auto glas = new GlasContext;
-    glas.gemv!(double, double, double)(1.0, a, b, 0.0, c);
+    gemv!(double, double, double)(1.0, a, b, 0.0, c);
 
     assert(c ==
         [-42.0,
diff --git a/source/mir/glas/l3.d b/source/mir/glas/l3.d
index 8a731e50..9165186b 100644
--- a/source/mir/glas/l3.d
+++ b/source/mir/glas/l3.d
@@ -32,7 +32,6 @@ public import mir.glas.common;
 
 import std.traits;
 import std.meta;
-import std.complex: Complex;
 import mir.ndslice.slice;
 import mir.internal.utility;
 
@@ -45,7 +44,6 @@ Performs general matrix-matrix multiplication.
 Pseudo_code: `C := alpha A × B + beta C`.
 
 Params:
-    ctx = GLAS context. Should not be accessed by other threads.
     alpha = scalar
     asl = `m ⨉ k` matrix
     bsl = `k ⨉ n` matrix
@@ -67,12 +65,11 @@ pragma(inline, true)
 nothrow @nogc
 void gemm(A, B, C)
 (
-    GlasContext* ctx,
     C alpha,
-        Slice!(2, A*) asl,
-        Slice!(2, B*) bsl,
+        Slice!(2, const(A)*) asl,
+        Slice!(2, const(B)*) bsl,
     C beta,
-        Slice!(2, C*) csl,
+        Slice!(2, C*)        csl,
     Conjugated conja = Conjugated.no,
     Conjugated conjb = Conjugated.no,
 )
@@ -91,7 +88,6 @@ body
     static assert(is(Unqual!C == C), msgWrongType);
     import mir.glas.internal.gemm: gemm_impl;
     gemm_impl(
-        ctx,
         alpha,
             cast(Slice!(2, Unqual!A*)) asl,
             cast(Slice!(2, Unqual!B*)) bsl,
@@ -123,8 +119,7 @@ unittest
 
     auto c = slice!double(3, 4);
 
-    auto glas = new GlasContext;
-    glas.gemm(1.0, a, b, 0.0, c);
+    gemm(1.0, a, b, 0.0, c);
 
     assert(c ==
         [[-42.0,  35,  -7, 77],
@@ -138,8 +133,7 @@ unittest
     auto b = slice!double(0, 4);
     auto c = slice!double(3, 4);
 
-    auto glas = new GlasContext;
-    glas.gemm(1.0, a, b, 0.0, c);
+    gemm(1.0, a, b, 0.0, c);
 
     assert(c ==
         [[0.0, 0, 0, 0],
@@ -147,7 +141,7 @@ unittest
          [0.0, 0, 0, 0]]);
 
     c[] = 2;
-    glas.gemm(1.0, a, b, 2, c);
+    gemm(1.0, a, b, 2, c);
 
     assert(c ==
         [[4.0, 4, 4, 4],
@@ -163,7 +157,6 @@ Pseudo_code: `C := alpha A × B + beta C` or `C := alpha B × A + beta C`,
     `C` are `m × n` matrices.
 
 Params:
-    ctx = GLAS context. Should not be accessed by other threads.
     side = specifies whether the symmetric matrix A
            appears on the  left or right  in the  operation.
     uplo = specifies  whether  the  upper  or  lower triangular
@@ -198,14 +191,13 @@ pragma(inline, true)
 nothrow @nogc
 void symm(A, B, C)
 (
-    GlasContext* ctx,
     Side side,
     Uplo uplo,
     C alpha,
-        Slice!(2, A*) asl,
-        Slice!(2, B*) bsl,
+        Slice!(2, const(A)*) asl,
+        Slice!(2, const(B)*) bsl,
     C beta,
-        Slice!(2, C*) csl,
+        Slice!(2, C*)        csl,
     Conjugated conja = Conjugated.no,
     Conjugated conjb = Conjugated.no,
 )
@@ -238,7 +230,6 @@ body
 
     import mir.glas.internal.symm: symm_impl;
     symm_impl(
-        ctx,
         alpha,
             cast(Slice!(2, Unqual!A*)) asl,
             cast(Slice!(2, Unqual!B*)) bsl,
@@ -268,8 +259,7 @@ unittest
 
     auto c = slice!double(3, 4);
 
-    auto glas = new GlasContext;
-    glas.symm(Side.left, Uplo.lower, 1.0, a, b, 0.0, c);
+    symm(Side.left, Uplo.lower, 1.0, a, b, 0.0, c);
 
     assert(c ==
         [[ 38,  23,  20,   2],
@@ -281,30 +271,26 @@ unittest
 unittest
 {
     import mir.ndslice;
-    import std.complex;
-    alias cd = Complex!double;
 
-    auto a = slice!cd(3, 3);
+    auto a = slice!cdouble(3, 3);
     a[] =
-        [[cd(-2, 0),   cd.init,   cd.init],
-         [cd(+3, 2), cd(-5, 0),   cd.init],
-         [cd(-4, 7), cd(-2, 3), cd(-3, 0)]];
+        [[-2 + 0i, cdouble.init, cdouble.init],
+         [+3 + 2i, -5 + 0i, cdouble.init],
+         [-4 + 7i, -2 + 3i, -3 + 0i]];
 
-    auto b = slice!cd(3, 4);
+    auto b = slice!cdouble(3, 4);
     b[] =
-        [[cd(-5, 3), cd(-3, 9), cd( 3, 2), cd(1, 2)],
-         [cd( 4, 5), cd( 3, 4), cd( 6, 5), cd(4, 9)],
-         [cd(-4, 2), cd(-2, 2), cd(-2, 7), cd(2, 6)]];
+        [[-5 + 3i, -3 + 9i,  3 + 2i, 1 + 2i],
+         [ 4 + 5i,  3 + 4i,  6 + 5i, 4 + 9i],
+         [-4 + 2i, -2 + 2i, -2 + 7i, 2 + 6i]];
 
-    auto c = slice!cd(3, 4);
-    auto d = slice!cd(3, 4);
+    auto c = slice!cdouble(3, 4);
+    auto d = slice!cdouble(3, 4);
 
-    auto glas = new GlasContext;
+    symm(Side.left, Uplo.lower, 1 + 0i, a, b, 0 + 0i, c, Conjugated.yes);
 
-    glas.symm(Side.left, Uplo.lower, cd(1.0), a, b, cd(0.0), c, Conjugated.yes);
-
-    ndEach!((ref a, ref b){a = conj(b);}, Select.triangular)(a, a.transposed);
-    glas.gemm(cd(1.0), a, b, cd(0.0), d);
+    ndEach!((ref a, ref b){a = (b.re - b.im * 1fi);}, Select.triangular)(a, a.transposed);
+    gemm(1 + 0i, a, b, 0 + 0i, d);
 
     assert(c == d);
 }
@@ -315,8 +301,7 @@ unittest
     auto b = slice!double(3, 4);
     auto c = slice!double(3, 4);
 
-    auto glas = new GlasContext;
-    glas.symm(Side.left, Uplo.lower, 0.0, a, b, 0.0, c);
+    symm(Side.left, Uplo.lower, 0.0, a, b, 0.0, c);
 
     assert(c ==
         [[0.0, 0, 0, 0],
@@ -324,7 +309,7 @@ unittest
          [0.0, 0, 0, 0]]);
 
     c[] = 2;
-    glas.symm(Side.left, Uplo.upper, 0.0, a, b, 2, c);
+    symm(Side.left, Uplo.upper, 0.0, a, b, 2, c);
 
     assert(c ==
         [[4.0, 4, 4, 4],
diff --git a/source/mir/internal/utility.d b/source/mir/internal/utility.d
index 0c0d89a4..4ca49f47 100644
--- a/source/mir/internal/utility.d
+++ b/source/mir/internal/utility.d
@@ -22,17 +22,18 @@ enum isSimpleSlice(S) = is(S : Slice!(N1, T1[]), size_t N1,T1) || is(S : Slice!(
 
 template realType(C)
 {
-    import std.complex : Complex;
-    static if (is(C : Complex!F, F))
-        alias realType = Unqual!F;
+    static if (isComplex!C)
+        alias realType = typeof(Unqual!C.init.re);
     else
         alias realType = Unqual!C;
 }
 
 template isComplex(C)
 {
-    import std.complex : Complex;
-    enum bool isComplex = is(C : Complex!F, F);
+    enum bool isComplex
+     = is(Unqual!C == creal)
+    || is(Unqual!C == cdouble)
+    || is(Unqual!C == cfloat);
 }
 
 //enum isSIMDVector(V) = is(V : __vector(F[N]), F, size_t N);
diff --git a/source/mir/sum.d b/source/mir/sum.d
index 88558a92..7bbc3eb7 100644
--- a/source/mir/sum.d
+++ b/source/mir/sum.d
@@ -7,6 +7,8 @@ Authors: Ilya Yaroshenko
 */
 module mir.sum;
 
+import ldc.attributes: fastmath;
+
 ///
 unittest
 {
@@ -90,12 +92,11 @@ All summation algorithms available for complex numbers.
 +/
 unittest
 {
-    import std.complex;
-    Complex!double[] ar = [complex(1.0, 2), complex(2, 3), complex(3, 4), complex(4, 5)];
-    Complex!double r = complex(10, 14);
+    cdouble[] ar = [1.0 + 2i, 2 + 3i, 3 + 4i, 4 + 5i];
+    cdouble r = 10 + 14i;
     assert(r == ar.sum!(Summation.fast));
     assert(r == ar.sum!(Summation.naive));
-    //assert(r == ar.sum!(Summation.pairwise));
+    assert(r == ar.sum!(Summation.pairwise));
     assert(r == ar.sum!(Summation.kahan));
     assert(r == ar.sum!(Summation.kbn));
     assert(r == ar.sum!(Summation.kb2));
@@ -246,6 +247,7 @@ import std.traits;
 import std.typecons;
 import std.range.primitives;
 import std.math: isInfinity, isFinite, isNaN, signbit;
+import mir.internal.utility: isComplex;
 
 
 /++
@@ -357,7 +359,7 @@ enum Summation
     naive,
 
     /++
-    SIMD optimized summation algorithm. It is identical to `naive` for now.
+    SIMD optimized summation algorithm.
     +/
     fast,
 }
@@ -369,6 +371,17 @@ struct Summator(T, Summation summation)
     if (isMutable!T)
 {
     import std.math: fabs;
+    //import mir.internal.math: fabs;
+
+    static if (summation == Summation.fast)
+        alias attr = fastmath;
+    else
+    {
+        import std.meta: AliasSeq;
+        alias attr = AliasSeq!();
+    }
+
+    @attr:
 
     static if (summation != Summation.pairwise)
         @disable this();
@@ -560,20 +573,34 @@ public:
         static if (summation == Summation.kb2)
         {
             s = n;
-            cs = 0.0;
-            ccs = 0.0;
+            static if (isComplex!T)
+            {
+                cs = 0 + 0fi;
+                ccs = 0 + 0fi;
+            }
+            else
+            {
+                cs = 0.0;
+                ccs = 0.0;
+            }
         }
         else
         static if (summation == Summation.kbn)
         {
             s = n;
-            c = 0.0;
+            static if (isComplex!T)
+                c = 0 + 0fi;
+            else
+                c = 0.0;
         }
         else
         static if (summation == Summation.kahan)
         {
             s = n;
-            c = 0.0;
+            static if (isComplex!T)
+                c = 0 + 0fi;
+            else
+                c = 0.0;
         }
         else
         static if (summation == Summation.pairwise)
@@ -719,29 +746,33 @@ public:
                 F t = s + x;
                 if (fabs(s.re) < fabs(x.re))
                 {
-                    auto t_re = s.re;
-                    s.re = x.re;
-                    x.re = t_re;
+                    auto s_re = s.re;
+                    auto x_re = x.re;
+                    s = x_re + s.im * 1fi;
+                    x = s_re + x.im * 1fi;
                 }
                 if (fabs(s.im) < fabs(x.im))
                 {
-                    auto t_im = s.im;
-                    s.im = x.im;
-                    x.im = t_im;
+                    auto s_im = s.im;
+                    auto x_im = x.im;
+                    s = s.re + x_im * 1fi;
+                    x = x.re + s_im * 1fi;
                 }
                 F c = (s-t)+x;
                 s = t;
                 if (fabs(cs.re) < fabs(c.re))
                 {
-                    auto t_re = cs.re;
-                    cs.re = c.re;
-                    c.re = t_re;
+                    auto c_re = c.re;
+                    auto cs_re = cs.re;
+                    c = cs_re + c.im * 1fi;
+                    cs = c_re + cs.im * 1fi;
                 }
                 if (fabs(cs.im) < fabs(c.im))
                 {
-                    auto t_im = cs.im;
-                    cs.im = c.im;
-                    c.im = t_im;
+                    auto c_im = c.im;
+                    auto cs_im = cs.im;
+                    c = c.re + cs_im * 1fi;
+                    cs = cs.re + c_im * 1fi;
                 }
                 F d = cs - t;
                 d += c;
@@ -774,15 +805,17 @@ public:
                 F t = s + x;
                 if (fabs(s.re) < fabs(x.re))
                 {
-                    auto t_re = s.re;
-                    s.re = x.re;
-                    x.re = t_re;
+                    auto s_re = s.re;
+                    auto x_re = x.re;
+                    s = x_re + s.im * 1fi;
+                    x = s_re + x.im * 1fi;
                 }
                 if (fabs(s.im) < fabs(x.im))
                 {
-                    auto t_im = s.im;
-                    s.im = x.im;
-                    x.im = t_im;
+                    auto s_im = s.im;
+                    auto x_im = x.im;
+                    s = s.re + x_im * 1fi;
+                    x = x.re + s_im * 1fi;
                 }
                 F d = s - t;
                 d += x;
@@ -1462,20 +1495,34 @@ public:
         static if (summation == Summation.kb2)
         {
             s = rhs;
-            cs = 0.0;
-            ccs = 0.0;
+            static if (isComplex!T)
+            {
+                cs = 0 + 0fi;
+                ccs = 0 + 0fi;
+            }
+            else
+            {
+                cs = 0.0;
+                ccs = 0.0;
+            }
         }
         else
         static if (summation == Summation.kbn)
         {
             s = rhs;
-            c = 0.0;
+            static if (isComplex!T)
+                c = 0 + 0fi;
+            else
+                c = 0.0;
         }
         else
         static if (summation == Summation.kahan)
         {
             s = rhs;
-            c = 0.0;
+            static if (isComplex!T)
+                c = 0 + 0fi;
+            else
+                c = 0.0;
         }
         else
         static if (summation == Summation.pairwise)
@@ -1816,33 +1863,10 @@ unittest
         foreach (t; test[0]) summator.put(t);
         auto r = test[1];
         auto s = summator.sum;
-        import core.exception: AssertError;
-        try
-        {
-            version(X86)
-            {
-                import std.math: nextDown, nextUp;
-                assert(summator.isNaN() == r.isNaN());
-                assert(summator.isFinite() == r.isFinite() || r == -double.max
-                       && s == -double.infinity || r == double.max && s == double.infinity);
-                assert(summator.isInfinity() == r.isInfinity() || r == -double.max
-                       && s == -double.infinity || r == double.max && s == double.infinity);
-                assert(nextDown(s) <= r && r <= nextUp(s) || s.isNaN && r.isNaN);
-            }
-            else
-            {
-                assert(summator.isNaN() == r.isNaN());
-                assert(summator.isFinite() == r.isFinite());
-                assert(summator.isInfinity() == r.isInfinity());
-                assert(s == r || s.isNaN && r.isNaN);
-            }
-        }
-        catch(AssertError e)
-        {
-            import std.stdio;
-            stderr.writefln("i = %s, result = %s (%A), target = %s (%A)", i, s, s, r, r);
-            throw e;
-        }
+        assert(summator.isNaN() == r.isNaN());
+        assert(summator.isFinite() == r.isFinite());
+        assert(summator.isInfinity() == r.isInfinity());
+        assert(s == r || s.isNaN && r.isNaN);
     }
 }
 
@@ -2030,7 +2054,7 @@ unittest
 /++
 Precise summation.
 +/
-private F sumPrecise(Range, F)(Range r, F seed = 0.0)
+private F sumPrecise(Range, F)(Range r, F seed = summationInitValue!F)
     if (isFloatingPoint!F || isComplex!F)
 {
     static if (isFloatingPoint!F)
@@ -2053,7 +2077,7 @@ private F sumPrecise(Range, F)(Range r, F seed = 0.0)
             sumRe.put(e.re);
             sumIm.put(e.im);
         }
-        return F(sumRe.sum, sumIm.sum);
+        return sumRe.sum + sumIm.sum * 1fi;
     }
 }
 
@@ -2104,10 +2128,15 @@ private T summationInitValue(T)()
         return a;
     }
     else
+    static if (__traits(compiles, {T a = 0 + 0fi;}))
+    {
+        T a = 0 + 0fi;
+        return a;
+    }
+    else
     {
         return T.init;
     }
-
 }
 
 private template sumType(Range)
@@ -2147,12 +2176,6 @@ unittest
     static assert(isSummable!(typeof(iota(ulong.init, ulong.init, ulong.init)), double));
 }
 
-template isComplex(C)
-{
-    import std.complex : Complex;
-    enum bool isComplex = is(C : Complex!F, F);
-}
-
 private enum bool isCompesatorAlgorithm(Summation summation) =
     summation == Summation.precise
  || summation == Summation.kb2