diff --git a/Install.sh b/Install.sh index 7779f52f0..657cf2db8 100755 --- a/Install.sh +++ b/Install.sh @@ -3,8 +3,7 @@ #========================================================= # [Note] Set the destination path for installation in Ins_dest #---------------------------------------------- -# Ins_dest="/usr/local/cytnx" -Ins_dest="~/Cytnx_lib" +Ins_dest="/usr/local/cytnx" FLAG="${FLAG} -DCMAKE_INSTALL_PREFIX=${Ins_dest}" #----------------------------------------------- @@ -19,7 +18,7 @@ FLAG="${FLAG} -DCMAKE_INSTALL_PREFIX=${Ins_dest}" # Please follow the guide from official mkl/oneMKL "post-installation" part # to source the proper setvars.sh and/or vars.sh #--------------------------- -FLAG="${FLAG} -DUSE_MKL=ON" +FLAG="${FLAG} -DUSE_MKL=OFF" #--------------------------- # 2-b) use OpenBLAS (DEFAULT = on (by settung DUSE_MKL=OFF above)) # [Note] By default it will automatically find openblas installed @@ -155,7 +154,7 @@ FLAG="${FLAG} -DUSE_OMP=OFF" #========================================================= # [Note] Wheather to run cytnx tests (DEFAULT = OFF) #----------------------------------- -DRUN_TESTS=ON +DRUN_TESTS=OFF FLAG="${FLAG} -DRUN_TESTS=${DRUN_TESTS}" #----------------------------------- diff --git a/bm_tests/CMakeLists.txt b/bm_tests/CMakeLists.txt index 70b4aacf6..8104dd680 100644 --- a/bm_tests/CMakeLists.txt +++ b/bm_tests/CMakeLists.txt @@ -13,6 +13,7 @@ add_executable( linalg/Directsum_bm.cpp linalg/Svd_bm.cpp linalg/Svd_truncate_bm.cpp + linalg/Lanczos_bm.cpp ) if(USE_CUDA) diff --git a/bm_tests/linalg/Lanczos_bm.cpp b/bm_tests/linalg/Lanczos_bm.cpp new file mode 100644 index 000000000..98f5b3d2f --- /dev/null +++ b/bm_tests/linalg/Lanczos_bm.cpp @@ -0,0 +1,138 @@ +#include +#include +using namespace cytnx; + +namespace BMTest_Lanczos { + UniTensor CreateOneSiteEffHam(const int d, const int D, const unsigned int dypte = Type.Double, + const int device = Device.cpu); + UniTensor CreateA(const int d, const int D, const unsigned int dtype = Type.Double, + const int device = Device.cpu); + class OneSiteOp : public LinOp { + public: + OneSiteOp(const int d, const int D, const unsigned int dtype = Type.Double, + const int& device = Device.cpu) + : LinOp("mv", D * D, dtype, device) { + EffH = CreateOneSiteEffHam(d, D, dtype, device); + } + UniTensor EffH; + + /* + * |-|--"vil" "pi" "vir"--|-| |-|--"vil" "pi" "vir"--|-| + * | | + | | "po" | | + | | + * |L|- -------O----------|R| dot | = |L|- -------O----------|R| + * | | + | | "vol"--A--"vor" | | + | | + * |_|--"vol" "po" "vor"--|_| |_|---------A----------|_| + * + * Then relabels ["vil", "pi", "vir"] -> ["vol", "po", "vor"] + * + * "vil":virtual in bond left + * "po":physical out bond + */ + UniTensor matvec(const UniTensor& A) override { + auto tmp = Contract(EffH, A); + tmp.permute_({"vil", "pi", "vir"}, 1); + tmp.relabels_(A.labels()); + return tmp; + } + }; + + // describe:test not supported UniTensor Type + + /* + * -1 + * | + * 0--A--2 + */ + UniTensor CreateA(const int d, const int D, const unsigned int dtype, const int device) { + double low = -1.0, high = 1.0; + UniTensor A = UniTensor({Bond(D), Bond(d), Bond(D)}, {}, -1, dtype, device) + .set_name("A") + .relabels_({"vol", "po", "vor"}) + .set_rowrank_(1); + if (Type.is_float(A.dtype())) { + random::uniform_(A, low, high, 0); + } + return A; + } + + /* + * |-|--"vil" "pi" "vir"--|-| + * | | + | | + * |L|- -------O----------|R| + * | | + | | + * |_|--"vol" "po" "vor"--|_| + */ + UniTensor CreateOneSiteEffHam(const int d, const int D, const unsigned int dtype, + 
const int device) { + double low = -1.0, high = 1.0; + std::vector bonds = {Bond(D), Bond(d), Bond(D), Bond(D), Bond(d), Bond(D)}; + std::vector heff_labels = {"vil", "pi", "vir", "vol", "po", "vor"}; + UniTensor HEff = UniTensor(bonds, {}, -1, dtype, device) + .set_name("HEff") + .relabels_(heff_labels) + .set_rowrank(bonds.size() / 2); + auto HEff_shape = HEff.shape(); + auto in_dim = 1; + for (int i = 0; i < HEff.rowrank(); ++i) { + in_dim *= HEff_shape[i]; + } + auto out_dim = in_dim; + if (Type.is_float(HEff.dtype())) { + random::uniform_(HEff, low, high, 0); + } + auto HEff_mat = HEff.get_block(); + HEff_mat.reshape_({in_dim, out_dim}); + HEff_mat = HEff_mat + HEff_mat.permute({1, 0}); // symmtrize + + // Let H can be converge in ExpM + auto eigs = HEff_mat.Eigh(); + auto e = UniTensor(eigs[0], true) * 0.01; + e.set_labels({"a", "b"}); + auto v = UniTensor(eigs[1]); + v.set_labels({"i", "a"}); + auto vt = UniTensor(linalg::InvM(v.get_block())); + vt.set_labels({"b", "j"}); + HEff_mat = Contract(Contract(e, v), vt).get_block(); + + // HEff_mat = linalg::Matmul(HEff_mat, HEff_mat.permute({1, 0}).Conj()); // positive definete + HEff_mat.reshape_(HEff_shape); + HEff.put_block(HEff_mat); + return HEff; + } + + static void BM_Lanczos_Gnd_F64(benchmark::State& state) { + // prepare data + int d = 2; + auto D = state.range(0); + auto op = OneSiteOp(d, D); + auto Tin = CreateA(d, D); + const double crit = 1.0e+8; + const int maxiter = 2; + bool is_V = true; + int k = 1; + bool is_row = false; + int max_krydim = 0; + // start test here + for (auto _ : state) { + auto x = linalg::Lanczos(&op, Tin, "Gnd", crit, maxiter, k, is_V, is_row, max_krydim); + } + } + BENCHMARK(BM_Lanczos_Gnd_F64)->Args({10})->Args({30})->Unit(benchmark::kMillisecond); + + static void BM_Lanczos_Exp_F64(benchmark::State& state) { + // prepare data + int d = 2; + auto D = state.range(0); + auto op = OneSiteOp(d, D); + auto Tin = CreateA(d, D); + const double crit = 1.0e+8; + double tau = 0.1; + const int maxiter = 2; + // start test here + for (auto _ : state) { + auto x = linalg::Lanczos_Exp(&op, Tin, tau, crit, maxiter); + } + } + BENCHMARK(BM_Lanczos_Exp_F64)->Args({10})->Args({30})->Unit(benchmark::kMillisecond); + +} // namespace BMTest_Lanczos diff --git a/cmake/Modules/FindCUQUANTUM.cmake b/cmake/Modules/FindCUQUANTUM.cmake index b34c629a8..9df2b0811 100644 --- a/cmake/Modules/FindCUQUANTUM.cmake +++ b/cmake/Modules/FindCUQUANTUM.cmake @@ -27,19 +27,7 @@ else() endif() message(STATUS " cudaver: ${CUDAToolkit_VERSION_MAJOR}" ) -if((${CUDAToolkit_VERSION_MAJOR} LESS_EQUAL 10)) - message(FATAL_ERROR "cuquantum requires CUDA ver.11+") -elseif((${CUDAToolkit_VERSION_MAJOR} GREATER_EQUAL 11) AND (${CUDAToolkit_VERSION_MAJOR} LESS 12) AND (${CUDAToolkit_VERSION_MINOR} LESS_EQUAL 0)) - set(CUTNLIB_DIR "lib/11.0") -elseif((${CUDAToolkit_VERSION_MAJOR} GREATER_EQUAL 11) AND (${CUDAToolkit_VERSION_MAJOR} LESS 12) AND (${CUDAToolkit_VERSION_MINOR} GREATER_EQUAL 1)) - set(CUTNLIB_DIR "lib/11") -elseif((${CUDAToolkit_VERSION_MAJOR} GREATER_EQUAL 12)) - if(EXISTS "${CUQUANTUM_ROOT}/lib/12") - set(CUTNLIB_DIR "lib/12") - else() - set(CUTNLIB_DIR "lib") - endif() -endif() +set(CUTNLIB_DIR "lib") set(CUQUANTUM_LIBRARY_DIRS ${CUQUANTUM_ROOT}/${CUTNLIB_DIR}) set(CUQUANTUM_INCLUDE_DIRS ${CUQUANTUM_ROOT}/include) diff --git a/cytnx/UniTensor_conti.py b/cytnx/UniTensor_conti.py index f8d8bdff3..f0c9f2eb5 100644 --- a/cytnx/UniTensor_conti.py +++ b/cytnx/UniTensor_conti.py @@ -211,6 +211,16 @@ def relabel_(self, idx:int, new_label:str): 
+@add_ovld_method(UniTensor) +def relabel_(self, old_labels:List[str],new_labels:List[str]): + self.c_relabel_(old_labels,new_labels); + return self + +@add_ovld_method(UniTensor) +def relabel_(self, new_labels:List[str]): + self.c_relabel_(new_labels); + return self + @add_ovld_method(UniTensor) def relabels_(self, old_labels:List[str],new_labels:List[str]): self.c_relabels_(old_labels,new_labels); diff --git a/example/TDVP/tdvp1_dense.py b/example/TDVP/tdvp1_dense.py new file mode 100644 index 000000000..ac6b7ac31 --- /dev/null +++ b/example/TDVP/tdvp1_dense.py @@ -0,0 +1,313 @@ +import sys,os +import numpy as np +import cytnx + +""" +Author: Hao-Ti Hung +""" + +def tdvp1_XXZmodel_dense(J, Jz, hx, hz, A, chi, dt, time_step): + + class OneSiteOp(cytnx.LinOp): + def __init__(self, L, M, R): + self.anet = cytnx.Network() + d = L.shape()[0] + D1 = L.shape()[2] + D2 = R.shape()[2] + cytnx.LinOp.__init__(self, "mv", D1*D2*d, L.dtype(), R.device()) + self.anet.FromString([\ + "psi: -1,-2,-3",\ + "L: -4,-1,0",\ + "R: -6,-3,2",\ + "M: -4,-6,-2,1",\ + "TOUT: 0,1;2"]) + self.anet.PutUniTensors(["L","M","R"],[L,M,R]) + def matvec(self, v): + self.anet.PutUniTensor("psi",v) + out = self.anet.Launch() + out.relabels_(v.labels()) + return out + + class ZeroSiteOp(cytnx.LinOp): + def __init__(self, L, R): + self.anet = cytnx.Network() + d = L.shape()[0] + D1 = L.shape()[2] + D2 = R.shape()[2] + cytnx.LinOp.__init__(self, "mv", D1*D2, L.dtype(), R.device()) + self.anet.FromString([\ + "C: -1,-2",\ + "L: -3,-1,0",\ + "R: -3,-2,1",\ + "TOUT: 0;1"]) + self.anet.PutUniTensors(["L","R"],[L,R]) + def matvec(self, v): + self.anet.PutUniTensor("C",v) + out = self.anet.Launch() + out.relabels_(v.labels()) + return out + + def time_evolve_Lan_f(psi, functArgs, delta): + L,M,R = functArgs + L = L.astype(cytnx.Type.ComplexDouble) + M = M.astype(cytnx.Type.ComplexDouble) + R = R.astype(cytnx.Type.ComplexDouble) + op = OneSiteOp(L,M,R) + exp_iH_v = cytnx.linalg.Lanczos_Exp(op, psi, -1.0j*delta*0.5, 1.0e-8) + exp_iH_v.relabels_(psi.labels()) + return exp_iH_v + + def time_evolve_Lan_b(psi, functArgs, delta): + L,R = functArgs + L = L.astype(cytnx.Type.ComplexDouble) + R = R.astype(cytnx.Type.ComplexDouble) + op = ZeroSiteOp(L,R) + exp_iH_v = cytnx.linalg.Lanczos_Exp(op, psi, 1.0j*delta*0.5, 1.0e-8) + exp_iH_v.relabels_(psi.labels()) + return exp_iH_v + + def get_energy(A, M): + N = len(A) + L0 = cytnx.UniTensor(cytnx.zeros([5,1,1]), rowrank = 0) #Left boundary + R0 = cytnx.UniTensor(cytnx.zeros([5,1,1]), rowrank = 0) #Right boundary + L0[0,0,0] = 1.; R0[4,0,0] = 1. 
+ L = L0 + anet = cytnx.Network() + anet.FromString(["L: -2,-1,-3",\ + "A: -1,-4,1",\ + "M: -2,0,-4,-5",\ + "A_Conj: -3,-5,2",\ + "TOUT: 0,1,2"]) + # or you can do: anet = cytnx.Network("L_AMAH.net") + for p in range(0, N): + anet.PutUniTensors(["L","A","A_Conj","M"],[L,A[p],A[p].Conj(),M]) + L = anet.Launch() + E = cytnx.Contract(L, R0).item() + print('energy:', E) + return E + + + d = 2 #physical dimension + sx = cytnx.physics.pauli('x').real() + sy = cytnx.physics.pauli('y') + sz = cytnx.physics.pauli('z').real() + sp = (sx+1j*sy).real() + sm = (sx-1j*sy).real() + + eye = cytnx.eye(d) + M = cytnx.zeros([5, 5, d, d]) + M[0,0] = M[4,4] = eye + M[0,4] = hx*sx + hz*sz + M[1,4] = sp + M[2,4] = sm + M[3,4] = sz + M[0,1] = 0.5*J*sm + M[0,2] = 0.5*J*sp + M[0,3] = Jz*sz + M = cytnx.UniTensor(M,0) + + L0 = cytnx.UniTensor(cytnx.zeros([5,1,1]), rowrank = 0) #Left boundary + R0 = cytnx.UniTensor(cytnx.zeros([5,1,1]), rowrank = 0) #Right boundary + L0[0,0,0] = 1.; R0[4,0,0] = 1. + + lbls = [] # List for storing the MPS labels + Nsites = len(A) + lbls.append(["0","1","2"]) # store the labels for later convinience. + + for k in range(1,Nsites): + lbl = [str(2*k),str(2*k+1),str(2*k+2)] + A[k].relabels_(lbl) + lbls.append(lbl) # store the labels for later convinience. + + LR = [None for i in range(Nsites+1)] + LR[0] = L0 + LR[-1] = R0 + + + for p in range(Nsites - 1): + + ## Changing to canonical form site by site: + s, A[p] ,vt = cytnx.linalg.Gesvd(A[p]) + A[p+1] = cytnx.Contract(cytnx.Contract(s,vt),A[p+1]) + + ## Calculate enviroments: + anet = cytnx.Network() + anet.FromString(["L: -2,-1,-3",\ + "A: -1,-4,1",\ + "M: -2,0,-4,-5",\ + "A_Conj: -3,-5,2",\ + "TOUT: 0,1,2"]) + # or you can do: anet = cytnx.Network("L_AMAH.net") + anet.PutUniTensors(["L","A","A_Conj","M"],[LR[p],A[p],A[p].Conj(),M]) + LR[p+1] = anet.Launch() + + # Recover the original MPS labels + A[p].relabels_(lbls[p]) + A[p+1].relabels_(lbls[p+1]) + + As = [] + As.append(A.copy()) + Es = [] + + + for k in range(1, time_step+1): + print('time:', k) + E = get_energy(A, M) + Es.append(E) + + for p in range(Nsites-1,-1,-1): + dim_l = A[p].shape()[0] + dim_r = A[p].shape()[2] + new_dim = min(dim_l*d,dim_r*d,chi) + + + psi = A[p].clone() + psi = time_evolve_Lan_f(psi, (LR[p],M,LR[p+1]), dt) + + psi.set_rowrank_(1) # maintain rowrank to perform the svd + s,_,A[p] = cytnx.linalg.Svd_truncate(psi,new_dim) + A[p].relabels_(lbls[p]); # set the label back to be consistent + # update LR from right to left: + anet = cytnx.Network() + anet.FromString(["R: -2,-1,-3",\ + "B: 1,-4,-1",\ + "M: 0,-2,-4,-5",\ + "B_Conj: 2,-5,-3",\ + "TOUT: ;0,1,2"]) + # or you can do: anet = cytnx.Network("R_AMAH.net") + anet.PutUniTensors(["R","B","M","B_Conj"],[LR[p+1],A[p],M,A[p].Conj()]) + old_LR = LR[p].clone() + if p != 0: + LR[p] = anet.Launch() + s = s/s.Norm().item() # normalize s + C = cytnx.Contract(_, s) + #C = time_evolve_b(C, (old_LR, LR[p]), dt) + C = time_evolve_Lan_b(C, (old_LR, LR[p]), dt) + + A[p-1] = cytnx.Contract(A[p-1], C).relabels_(A[p-1].labels()) + + + + + print('Sweep[r->l]: %d/%d, Loc: %d' % (k, time_step, p)) + + A[0].set_rowrank_(1) + _,A[0] = cytnx.linalg.Gesvd(A[0],is_U=False, is_vT=True) + A[0].relabels_(lbls[0]); #set the label back to be consistent + + + for p in range(Nsites): + dim_l = A[p].shape()[0] + dim_r = A[p].shape()[2] + new_dim = min(dim_l*d,dim_r*d,chi) + + psi = A[p] + psi = time_evolve_Lan_f(psi, (LR[p],M,LR[p+1]), dt) + + psi.set_rowrank_(2) # maintain rowrank to perform the svd + s,A[p],_ = 
cytnx.linalg.Svd_truncate(psi,new_dim) + A[p].relabels_(lbls[p]); #set the label back to be consistent + # update LR from left to right: + anet = cytnx.Network() + anet.FromString(["L: -2,-1,-3",\ + "A: -1,-4,1",\ + "M: -2,0,-4,-5",\ + "A_Conj: -3,-5,2",\ + "TOUT: 0,1,2"]) + + anet.PutUniTensors(["L","A","A_Conj","M"],[LR[p],A[p],A[p].Conj(),M]) + old_LR = LR[p+1].clone() + + + if p != Nsites - 1: + LR[p+1] = anet.Launch() + s = s/s.Norm().item() # normalize s + C = cytnx.Contract(s, _) + C = time_evolve_Lan_b(C, (LR[p+1],old_LR), dt) + A[p+1] = cytnx.Contract(A[p+1], C) + A[p+1].permute_(['_aux_L', lbls[p+1][1], lbls[p+1][2]]) + A[p+1].relabels_(lbls[p+1]) + + print('Sweep[l->r]: %d/%d, Loc: %d' % (k, time_step, p)) + + A[-1].set_rowrank_(2) + _,A[-1] = cytnx.linalg.Gesvd(A[-1],is_U=True,is_vT=False) ## last one. + A[-1].relabels_(lbls[-1]); #set the label back to be consistent + As.append(A.copy()) + return As, Es # all time step states + +def Local_meas(A, B, Op, site): + N = len(A) + l = cytnx.UniTensor(cytnx.eye(1), rowrank = 1) + anet = cytnx.Network() + anet.FromString(["l: 0,3",\ + "A: 0,1,2",\ + "B: 3,1,4",\ + "TOUT: 2;4"]) + for i in range(0, N): + if i != site: + anet.PutUniTensors(["l","A","B"],[l,A[i],B[i].Conj()]) + l = anet.Launch() + else: + tmp = A[i].relabel(1, "_aux_up") + Op = Op.relabels(["_aux_up", "_aux_low"]) + tmp = cytnx.Contract(tmp, Op) + tmp.relabel_("_aux_low", A[i].labels()[1]) + tmp.permute_(A[i].labels()) + anet.PutUniTensors(["l","A","B"],[l,tmp,B[i].Conj()]) + l = anet.Launch() + + return l.reshape(1).item() + + +def prepare_rand_init_MPS(Nsites, chi, d): + lbls = [] + A = [None for i in range(Nsites)] + A[0] = cytnx.UniTensor(cytnx.random.normal([1, d, min(chi, d)], 0., 1.), rowrank = 2) + A[0].relabels_(["0","1","2"]) + lbls.append(["0","1","2"]) # store the labels for later convinience. + + for k in range(1,Nsites): + dim1 = A[k-1].shape()[2]; dim2 = d + dim3 = min(min(chi, A[k-1].shape()[2] * d), d ** (Nsites - k - 1)) + A[k] = cytnx.UniTensor(cytnx.random.normal([dim1, dim2, dim3],0.,1.), rowrank = 2) + + lbl = [str(2*k),str(2*k+1),str(2*k+2)] + A[k].relabels_(lbl) + lbls.append(lbl) # store the labels for later convinience. + return A + + +if __name__ == '__main__': + #prepare random MPS + Nsites = 7 # Number of sites + chi = 8 # MPS bond dimension + d = 2 + MPS_rand = prepare_rand_init_MPS(Nsites, chi, d) + + # simulate ground state by imaginary time evolution by tdvp + J = 0.0 + Jz = 0.0 + hx = 0.0 + hz = -1.0 + tau = -1.0j + time_step = 10 + # prepare up state + As, Es = tdvp1_XXZmodel_dense(J, Jz, hx, hz, MPS_rand, chi, tau, time_step) + GS = As[time_step - 1] + + # real tiem evoolution + J = 0.0 + Jz = 0.5 + hx = 0.3 + hz = 3.0 + dt = 0.1 + time_step = 25 # number of DMRG sweeps + As, Es = tdvp1_XXZmodel_dense(J, Jz, hx, hz, GS, chi, dt, time_step) + + # measure middle site + Sz = cytnx.UniTensor(cytnx.physics.pauli('z').real(), rowrank = 1) + Szs = [] + mid_site = int(Nsites/2) + for i in range(0, len(As)): + Szs.append(Local_meas(As[i], As[i], Sz, mid_site).real) diff --git a/include/Bond.hpp b/include/Bond.hpp index d559b5795..8f9909ef5 100644 --- a/include/Bond.hpp +++ b/include/Bond.hpp @@ -556,6 +556,74 @@ namespace cytnx { /** @brief combine multiple input bonds with self, and return a new combined Bond instance. + @param[in] bds the bonds that to be combined with self. + @param[in] is_grp this parameter is only used when the bond is + symmetric bond (bondType.BD_BRA or bondType.BD_KET). 
+ If is_grp is true, the basis with duplicated quantum number will be + grouped together as a single basis. See + group_duplicates(std::vector &mapper) const. + @return [Bond] a new combined bond instance. + @pre + 1. The type of all bonds (see cytnx::bondType) need to be same. + 2. The Symmetry of all bonds should be same. + @note Compare to \n + combineBond_(const std::vector &bds, const bool &is_grp),\n + this function will create a new Bond object. + @see combineBond_(const std::vector &bds, const bool &is_grp) + + ## Example: + ### c++ API: + \include example/Bond/combineBond.cpp + #### output> + \verbinclude example/Bond/combineBond.cpp.out + ### python API: + \include example/Bond/combineBond.py + #### output> + \verbinclude example/Bond/combineBond.py.out + */ + Bond combineBond(const std::vector &bds, const bool &is_grp = true) { + Bond out = this->clone(); + for (cytnx_uint64 i = 0; i < bds.size(); i++) { + out.combineBond_(bds[i], is_grp); + } + return out; + } + + /** + @brief combine multiple input bonds with self, inplacely + @param[in] bds the bonds that to be combined with self. + @param[in] is_grp this parameter is only used when the bond is + symmetric bond (bondType.BD_BRA or bondType.BD_KET). + If is_grp is true, the basis with duplicated quantum number will be + grouped together as a single basis. See + group_duplicates(std::vector &mapper) const. + @pre + 1. The type of all bonds (see cytnx::bondType) need to be same. + 2. The Symmetry of all bonds should be same. + @note Compare to \n + combineBond(const std::vector &bds, const bool &is_grp),\n + this function will create a new Bond object. + @see combineBond(const std::vector &bds, const bool &is_grp) + + ## Example: + ### c++ API: + \include example/Bond/combineBond_.cpp + #### output> + \verbinclude example/Bond/combineBond_.cpp.out + ### python API: + \include example/Bond/combineBond_.py + #### output> + \verbinclude example/Bond/combineBond_.py.out + */ + void combineBond_(const std::vector &bds, const bool &is_grp = true) { + for (cytnx_uint64 i = 0; i < bds.size(); i++) { + this->combineBond_(bds[i], is_grp); + } + } + + /** + @deprecated + @brief combine multiple input bonds with self, and return a new combined Bond instance. @param[in] bds the bonds that to be combined with self. @param[in] is_grp this parameter is only used when the bond is symmetric bond (bondType.BD_BRA or bondType.BD_KET). @@ -590,6 +658,7 @@ namespace cytnx { } /** + @deprecated @brief combine multiple input bonds with self, inplacely @param[in] bds the bonds that to be combined with self. 
@param[in] is_grp this parameter is only used when the bond is diff --git a/include/UniTensor.hpp b/include/UniTensor.hpp index a8475a0e4..08a945370 100644 --- a/include/UniTensor.hpp +++ b/include/UniTensor.hpp @@ -199,7 +199,10 @@ namespace cytnx { } void set_labels(const std::vector &new_labels); + void relabel_(const std::vector &new_labels); // implemented void relabels_(const std::vector &new_labels); // implemented + void relabel_(const std::vector &old_labels, + const std::vector &new_labels); // implemented void relabels_(const std::vector &old_labels, const std::vector &new_labels); // implemented void relabel_(const std::string &old_label, const std::string &new_label) { @@ -293,6 +296,7 @@ namespace cytnx { const cytnx_uint64 &rowrank = 0); virtual boost::intrusive_ptr to_dense(); virtual void to_dense_(); + virtual void combineBond(const std::vector &indicators, const bool &force = false); virtual void combineBonds(const std::vector &indicators, const bool &force, const bool &by_label); virtual void combineBonds(const std::vector &indicators, @@ -310,9 +314,13 @@ namespace cytnx { virtual boost::intrusive_ptr Trace(const std::string &a, const std::string &b); virtual boost::intrusive_ptr Trace(const cytnx_int64 &a, const cytnx_int64 &b); + virtual boost::intrusive_ptr relabel( + const std::vector &new_labels); virtual boost::intrusive_ptr relabels( const std::vector &new_labels); + virtual boost::intrusive_ptr relabel( + const std::vector &old_labels, const std::vector &new_labels); virtual boost::intrusive_ptr relabels( const std::vector &old_labels, const std::vector &new_labels); @@ -532,8 +540,11 @@ namespace cytnx { void permute_(const std::vector &mapper, const cytnx_int64 &rowrank = -1); void permute_(const std::vector &mapper, const cytnx_int64 &rowrank = -1); + boost::intrusive_ptr relabel(const std::vector &new_labels); boost::intrusive_ptr relabels(const std::vector &new_labels); + boost::intrusive_ptr relabel(const std::vector &old_labels, + const std::vector &new_labels); boost::intrusive_ptr relabels(const std::vector &old_labels, const std::vector &new_labels); @@ -722,6 +733,7 @@ namespace cytnx { * @param permute_back * @param by_label */ + void combineBond(const std::vector &indicators, const bool &force = true); void combineBonds(const std::vector &indicators, const bool &force, const bool &by_label); void combineBonds(const std::vector &indicators, const bool &force = true); @@ -1365,8 +1377,11 @@ namespace cytnx { const bool &mv_elem_self = false, const bool &mv_elem_rhs = false); + boost::intrusive_ptr relabel(const std::vector &new_labels); boost::intrusive_ptr relabels(const std::vector &new_labels); + boost::intrusive_ptr relabel(const std::vector &old_labels, + const std::vector &new_labels); boost::intrusive_ptr relabels(const std::vector &old_labels, const std::vector &new_labels); @@ -1704,6 +1719,7 @@ namespace cytnx { void group_basis_(); + void combineBond(const std::vector &indicators, const bool &force = false); void combineBonds(const std::vector &indicators, const bool &force = false); void combineBonds(const std::vector &indicators, const bool &force, const bool &by_label); @@ -2307,6 +2323,20 @@ namespace cytnx { /** @brief Set new labels for all the bonds. @param[in] new_labels the new labels for each bond. + @note + 1. the new assign label cannot be the same as the label of any other bonds in the + UniTensor. ( cannot have duplicate labels ) + 2. 
Compare to relabel(const std::vector &new_labels) const, this + function set the new label to itself. + */ + UniTensor &relabel_(const std::vector &new_labels) { + this->_impl->relabel_(new_labels); + return *this; + } + /** + @deprecated + @brief Set new labels for all the bonds. + @param[in] new_labels the new labels for each bond. @note 1. the new assign label cannot be the same as the label of any other bonds in the UniTensor. ( cannot have duplicate labels ) @@ -2319,6 +2349,23 @@ namespace cytnx { } /** + @brief relable all of the labels in UniTensor. + @param[in] new_labels the new labels for each bond. + @note + 1. the new assign label cannot be the same as the label of any other bonds in the + UniTensor. ( cannot have duplicate labels ) + + @attention This function will return a new UniTensor with the new label, but the data is + still shared with the original UniTensor. That is the meta data of the UniTensor is + different, but the internal data is still shared. + */ + UniTensor relabel(const std::vector &new_labels) const { + UniTensor out; + out._impl = this->_impl->relabel(new_labels); + return out; + } + /** + @deprecated @brief relables all of the labels in UniTensor. @param[in] new_labels the new labels for each bond. @note @@ -2336,6 +2383,20 @@ namespace cytnx { } /** + @see relabel(const std::vector &new_labels) const + */ + UniTensor relabel(const std::initializer_list &new_labels) const { + std::vector new_lbls(new_labels); + std::vector vs(new_lbls.size()); + transform(new_lbls.begin(), new_lbls.end(), vs.begin(), + [](char *x) -> std::string { return std::string(x); }); + + UniTensor out; + out._impl = this->_impl->relabel(vs); + return out; + } + /** + @deprecated @see relabels(const std::vector &new_labels) const */ UniTensor relabels(const std::initializer_list &new_labels) const { @@ -2349,6 +2410,19 @@ namespace cytnx { return out; } /** + @see relabel_(const std::vector &new_labels) + */ + UniTensor &relabel_(const std::initializer_list &new_labels) { + std::vector new_lbls(new_labels); + std::vector vs(new_lbls.size()); + transform(new_lbls.begin(), new_lbls.end(), vs.begin(), + [](char *x) -> std::string { return std::string(x); }); + + this->_impl->relabel_(vs); + return *this; + } + /** + @deprecated @see relabels_(const std::vector &new_labels) */ UniTensor &relabels_(const std::initializer_list &new_labels) { @@ -2368,6 +2442,20 @@ namespace cytnx { @note 1. the final output UniTensor cannot have duplicate labels. */ + UniTensor relabel(const std::vector &old_labels, + const std::vector &new_labels) const { + UniTensor out; + out._impl = this->_impl->relabel(old_labels, new_labels); + return out; + } + /** + @deprecated + @brief replace part or all labels by given new labels for the bonds. + @param[in] old_labels the old labels for each bond. + @param[in] new_labels the new labels for each bond. + @note + 1. the final output UniTensor cannot have duplicate labels. + */ UniTensor relabels(const std::vector &old_labels, const std::vector &new_labels) const { UniTensor out; @@ -2376,6 +2464,27 @@ namespace cytnx { } /** + @brief relable part or all of the labels in UniTensor by given new labels + @param[in] old_labels the old labels for each bond. + @param[in] new_labels the new labels for each bond. + @note + 1. the final output UniTensor cannot have duplicate labels. + 2. Compare to relabel(const std::vector &old_labels, const + std::vector &new_labels) const , this function set the new label(s) to itself. 
+ + @see relabel(const std::vector &old_labels, const std::vector + &new_labels) const + @attention This function will return a new UniTensor with the new labels, but the data is + still shared with the original UniTensor. That is the meta data of the UniTensor is + different, but the internal data is still shared. + */ + UniTensor &relabel_(const std::vector &old_labels, + const std::vector &new_labels) { + this->_impl->relabel_(old_labels, new_labels); + return *this; + } + /** + @deprecated @brief relables part or all of the labels in UniTensor by given new labels @param[in] old_labels the old labels for each bond. @param[in] new_labels the new labels for each bond. @@ -2397,6 +2506,26 @@ namespace cytnx { } /** + @see relabel(const std::vector &old_labels, const std::vector + &new_labels) const + */ + UniTensor relabel(const std::initializer_list &old_labels, + const std::initializer_list &new_labels) const { + std::vector new_lbls(new_labels); + std::vector vs(new_lbls.size()); + transform(new_lbls.begin(), new_lbls.end(), vs.begin(), + [](char *x) -> std::string { return std::string(x); }); + + std::vector old_lbls(old_labels); + std::vector vs_old(old_lbls.size()); + transform(old_lbls.begin(), old_lbls.end(), vs_old.begin(), + [](char *x) -> std::string { return std::string(x); }); + + return this->relabel(vs_old, vs); + } + + /** + @deprecated @see relabels(const std::vector &old_labels, const std::vector &new_labels) const */ @@ -2416,6 +2545,26 @@ namespace cytnx { } /** + @see relabel_(const std::vector &old_labels, const std::vector + &new_labels) + */ + UniTensor &relabel_(const std::initializer_list &old_labels, + const std::initializer_list &new_labels) { + std::vector new_lbls(new_labels); + std::vector vs(new_lbls.size()); + transform(new_lbls.begin(), new_lbls.end(), vs.begin(), + [](char *x) -> std::string { return std::string(x); }); + + std::vector old_lbls(old_labels); + std::vector vs_old(old_lbls.size()); + transform(old_lbls.begin(), old_lbls.end(), vs_old.begin(), + [](char *x) -> std::string { return std::string(x); }); + + this->relabel_(vs_old, vs); + return *this; + } + /** + @deprecated @see relabels_(const std::vector &old_labels, const std::vector &new_labels) */ @@ -3213,7 +3362,7 @@ namespace cytnx { /** * @deprecated This function is deprecated. Please use \n - * combineBonds(const std::vector &indicators, const bool &force) \n + * combineBond(const std::vector &indicators, const bool &force) \n * instead. */ void combineBonds(const std::vector &indicators, const bool &force, @@ -3222,6 +3371,7 @@ namespace cytnx { } /** + @deprecated @brief Combine the sevral bonds of the UniTensor. @param[in] indicators the labels of the lags you want to combine. @param[in] force If force is true, it will combine the bonds anyway even the direction @@ -3237,13 +3387,27 @@ namespace cytnx { /** * @deprecated This function is deprecated. Please use \n - * combineBonds(const std::vector &indicators, const bool &force) \n + * combineBond(const std::vector &indicators, const bool &force) \n * instead. */ void combineBonds(const std::vector &indicators, const bool &force = false) { this->_impl->combineBonds(indicators, force); } + /** + @brief Combine the sevral bonds of the UniTensor. + @param[in] indicators the labels of the lags you want to combine. + @param[in] force If force is true, it will combine the bonds anyway even the direction + of the bonds are same. After combining, the direction of the bonds will be set as + same as the first bond. + @pre + 1. 
The size of \p indicators need to >= 2. + 2. The UniTensor cannot be diagonal form (that means is_diag cannot be true.) + */ + void combineBond(const std::vector &indicators, const bool &force = false) { + this->_impl->combineBond(indicators, force); + } + /** @brief Contract the UniTensor with common labels. @details This function contract the UniTensor lags with common labels. @@ -4454,6 +4618,22 @@ namespace cytnx { See also \link cytnx::UniTensor::contract UniTensor.contract \endlink + */ + UniTensor Contract(const std::vector &TNs, const std::string &order, + const bool &optimal); + + /** + @deprecated + @brief Contract multiple UniTensor by tracing the ranks with common labels with pairwise + operation. + @param[in] TNs the Tensors. + @param[in] order desired contraction order. + @param[in] optimal wheather to find the optimal contraction order automatically. + @return + [UniTensor] + + See also \link cytnx::UniTensor::contract UniTensor.contract \endlink + */ UniTensor Contracts(const std::vector &TNs, const std::string &order, const bool &optimal); @@ -4477,6 +4657,26 @@ namespace cytnx { See also \link cytnx::UniTensor::contract UniTensor.contract \endlink + */ + template + UniTensor Contract(const UniTensor &in, const T &...args, const std::string &order, + const bool &optimal) { + std::vector TNlist; + _resolve_CT(TNlist, in, args...); + return Contract(TNlist, order, optimal); + } + + /** + @deprecated + @brief Contract multiple UniTensor by tracing the ranks with common labels with pairwise + operation. + @param in the Tensors. + @param args the Tensors. + @return + [UniTensor] + + See also \link cytnx::UniTensor::contract UniTensor.contract \endlink + */ template UniTensor Contracts(const UniTensor &in, const T &...args, const std::string &order, diff --git a/include/linalg.hpp b/include/linalg.hpp index e75d3f283..9cd1b736d 100644 --- a/include/linalg.hpp +++ b/include/linalg.hpp @@ -1,19 +1,20 @@ #ifndef _linalg_H_ #define _linalg_H_ -#include "Type.hpp" -#include "cytnx_error.hpp" +#include "LinOp.hpp" #include "Tensor.hpp" +#include "Type.hpp" #include "UniTensor.hpp" -#include "LinOp.hpp" +#include "cytnx_error.hpp" #ifdef BACKEND_TORCH #else - #include "backend/Storage.hpp" - #include "backend/Scalar.hpp" #include + #include "backend/Scalar.hpp" + #include "backend/Storage.hpp" + namespace cytnx { int set_mkl_ilp64(); int get_mkl_code(); @@ -721,7 +722,8 @@ namespace cytnx { std::vector Svd_truncate(const cytnx::UniTensor &Tin, const cytnx_uint64 &keepdim, const double &err = 0, const bool &is_UvT = true, - const unsigned int &return_err = 0); + const unsigned int &return_err = 0, + const unsigned int &mindim = 0); /** * @brief Perform Singular-Value decomposition on a UniTensor with truncation. 
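Note on the truncation hunks here: this hunk and the Gesvd_truncate hunk that follows add a trailing mindim argument to the truncated SVD drivers, and the pybind changes further below expose it with a default of 0. A minimal Python sketch of a call using the new keyword follows; the keyword names are taken from the py::arg list in pybind/linalg_py.cpp, while reading mindim as a lower bound on the number of kept singular values is an assumption based on the parameter name, since the patch itself does not document its semantics.

import cytnx

# Illustrative only: a random rank-3 UniTensor with rowrank 2, then a truncated SVD.
T = cytnx.UniTensor(cytnx.random.normal([8, 2, 8], 0., 1.), rowrank=2)
s, U, vT = cytnx.linalg.Svd_truncate(T, keepdim=6, err=1.0e-12,
                                     is_UvT=True, return_err=0, mindim=2)
# keepdim caps the kept singular values at 6, err drops the ones below the threshold,
# and mindim (assumed meaning) keeps at least 2 of them even if err would cut deeper.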
@@ -736,7 +738,8 @@ namespace cytnx { std::vector Gesvd_truncate(const cytnx::UniTensor &Tin, const cytnx_uint64 &keepdim, const double &err = 0, const bool &is_U = true, const bool &is_vT = true, - const unsigned int &return_err = 0); + const unsigned int &return_err = 0, + const unsigned int &mindim = 0); std::vector Hosvd( const cytnx::UniTensor &Tin, const std::vector &mode, @@ -1559,7 +1562,8 @@ namespace cytnx { */ std::vector Svd_truncate(const Tensor &Tin, const cytnx_uint64 &keepdim, const double &err = 0, const bool &is_UvT = true, - const unsigned int &return_err = 0); + const unsigned int &return_err = 0, + const unsigned int &mindim = 0); // Gesvd_truncate: //================================================== @@ -1598,8 +1602,8 @@ namespace cytnx { */ std::vector Gesvd_truncate(const Tensor &Tin, const cytnx_uint64 &keepdim, const double &err = 0, const bool &is_U = true, - const bool &is_vT = true, - const unsigned int &return_err = 0); + const bool &is_vT = true, const unsigned int &return_err = 0, + const unsigned int &mindim = 0); // Hosvd: std::vector Hosvd( @@ -2234,6 +2238,46 @@ namespace cytnx { const cytnx_double &cvg_crit = 1.0e-9, const cytnx_uint64 &k = 1, const bool &is_V = true, const bool &verbose = false); + // Arnoldi: + //=========================================== + /** + @brief perform Arnoldi for matrices or linear function. + @details This function calculate the eigen value problem using Arnoldi algorithm. + @param[in] Hop the Linear Operator defined by LinOp class or it's inheritance (see LinOp). + @param[in] Tin the initial UniTensor. + @param[in] which + @parblock + which order eigenvlues and corresponding eigenvectors should be find, the supported + options are: + + 'LM' : largest magnitude + 'LR' : largest real part + 'LI' : largest imaginary part + 'SR' : smallest real part + 'SI' : smallest imaginary part + + @endparblock + @param[in] maxiter the maximum interation steps for each k. + @param[in] cvg_crit the convergence criterion of the energy. + @param[in] k the number of lowest k eigen values. + @param[in] is_V if set to true, the eigen vectors will be returned. + @param[in] verbose print out iteration info. + @return + [eigvals (UniTensor), eigvec_1, eivec_2, ..., eigvec_k]. + The first UniTensor contains eigenvalues. + @note + To use, define a linear operator with LinOp class either by assign a custom function or + create a class that inherit LinOp (see LinOp for further details) + + @pre + 1. The initial UniTensor cannot be empty. + 2. The UniTensor version of the Arnoldi not support \p which = 'SM'. + */ + std::vector Arnoldi(LinOp *Hop, const UniTensor &Tin, const std::string which = "LM", + const cytnx_uint64 &maxiter = 10000, + const cytnx_double &cvg_crit = 1.0e-9, const cytnx_uint64 &k = 1, + const bool &is_V = true, const bool &verbose = false); + // Lanczos: //=========================================== /** @@ -2390,6 +2434,42 @@ namespace cytnx { const bool &verbose = false, const unsigned int &Maxiter = 100000); + // Lanczos_Exp: + //=============================================== + /** + @brief Perform the Lanczos algorithm for hermitian operator + \f$H\f$ to approximate \f$e^{H\tau}v\f$. + @details + This function perform the Lanczos-like algorithm for hermitian + linear operator \f$H\f$ to approximate + \f[ + e^{H\tau}v + \f] and return the state \f$w\f$ such that + \f[ + |\exp(H\tau)v - w| < \delta. + \f] + Here \f$v\f$ is a given vector or a state. 
+ @param[in] Hop the Linear Operator defined by LinOp class or it's inheritance (see LinOp). The + operation method \f$Hv\f$ need to be defined in it. + @param[in] v The input vector (or state). The norm \f$|v|\f$ should be equal to 1. + @param[in] tau A scalar, it can be complex number. + @param[in] CvgCrit \f$\delta\f$, the convergence criterion. + @param[in] Maxiter the maximum interation steps for each k. + @param[in] verbose print out iteration info. + @return + UniTensor \f$w\f$ + @note + To use, define a linear operator with LinOp class either by assign a custom function or + create a class that inherit LinOp (see LinOp for further details) + @warning + User need to guarantee that the input operator \f$H\f$ is Hermitian + , and the exponetiate \f$e^{-H\tau}\f$ will converged. Ohterwise, the function will return the + wrong results without any warning. + */ + UniTensor Lanczos_Exp(LinOp *Hop, const UniTensor &v, const Scalar &tau, + const double &CvgCrit = 1.0e-10, const unsigned int &Maxiter = 100000, + const bool &verbose = false); + // Lstsq: //=========================================== /** diff --git a/pybind/bond_py.cpp b/pybind/bond_py.cpp index 589f45344..3e4f9e54d 100644 --- a/pybind/bond_py.cpp +++ b/pybind/bond_py.cpp @@ -100,8 +100,16 @@ void bond_binding(py::module &m) { .def("clone", &Bond::clone) .def("__copy__", &Bond::clone) .def("__deepcopy__", &Bond::clone) - .def("combineBond", &Bond::combineBond, py::arg("bd"), py::arg("is_grp") = true) - .def("combineBond_", &Bond::combineBond_, py::arg("bd"), py::arg("is_grp") = true) + .def("combineBond", [](Bond &self, const Bond &bd, + bool is_grp = true) { return self.combineBond(bd, is_grp); }) + .def("combineBond", [](Bond &self, const std::vector &bds, + bool is_grp = true) { return self.combineBonds(bds, is_grp); }) + .def("combineBond_", [](Bond &self, const Bond &bd, + bool is_grp = true) { return self.combineBond_(bd, is_grp); }) + .def("combineBond_", [](Bond &self, const std::vector &bds, + bool is_grp = true) { return self.combineBonds_(bds, is_grp); }) + // .def("combineBond", &Bond::combineBond, py::arg("bd"), py::arg("is_grp") = true) + // .def("combineBond_", &Bond::combineBond_, py::arg("bd"), py::arg("is_grp") = true) .def("combineBonds", &Bond::combineBonds, py::arg("bds"), py::arg("is_grp") = true) .def("combineBonds_", &Bond::combineBonds_, py::arg("bds"), py::arg("is_grp") = true) .def("getDegeneracies", [](Bond &self) { return self.getDegeneracies(); }) diff --git a/pybind/linalg_py.cpp b/pybind/linalg_py.cpp index 36dfaed5e..d337deeaf 100644 --- a/pybind/linalg_py.cpp +++ b/pybind/linalg_py.cpp @@ -1,14 +1,14 @@ -#include -#include -#include - -#include -#include -#include -#include -#include #include #include +#include +#include +#include +#include +#include + +#include +#include +#include #include "cytnx.hpp" // #include "../include/cytnx_error.hpp" @@ -49,36 +49,36 @@ void linalg_binding(py::module &m) { m_linalg.def( "Gesvd_truncate", [](const Tensor &Tin, const cytnx_uint64 &keepdim, const cytnx_double &err, const bool &is_U, - const bool &is_vT, const unsigned int &return_err) { - return cytnx::linalg::Gesvd_truncate(Tin, keepdim, err, is_U, is_vT, return_err); + const bool &is_vT, const unsigned int &return_err, const unsigned int &mindim) { + return cytnx::linalg::Gesvd_truncate(Tin, keepdim, err, is_U, is_vT, return_err, mindim); }, py::arg("Tin"), py::arg("keepdim"), py::arg("err") = double(0), py::arg("is_U") = true, - py::arg("is_vT") = true, py::arg("return_err") = (unsigned int)(0)); + 
py::arg("is_vT") = true, py::arg("return_err") = (unsigned int)(0), py::arg("mindim") = 0); m_linalg.def( "Gesvd_truncate", [](const UniTensor &Tin, const cytnx_uint64 &keepdim, const cytnx_double &err, const bool &is_U, - const bool &is_vT, const unsigned int &return_err) { - return cytnx::linalg::Gesvd_truncate(Tin, keepdim, err, is_U, is_vT, return_err); + const bool &is_vT, const unsigned int &return_err, const unsigned int &mindim) { + return cytnx::linalg::Gesvd_truncate(Tin, keepdim, err, is_U, is_vT, return_err, mindim); }, py::arg("Tin"), py::arg("keepdim"), py::arg("err") = 0, py::arg("is_U") = true, - py::arg("is_vT") = true, py::arg("return_err") = (unsigned int)(0)); + py::arg("is_vT") = true, py::arg("return_err") = (unsigned int)(0), py::arg("mindim") = 0); m_linalg.def( "Svd_truncate", [](const Tensor &Tin, const cytnx_uint64 &keepdim, const cytnx_double &err, const bool &is_UvT, - const unsigned int &return_err) { - return cytnx::linalg::Svd_truncate(Tin, keepdim, err, is_UvT, return_err); + const unsigned int &return_err, const unsigned int &mindim) { + return cytnx::linalg::Svd_truncate(Tin, keepdim, err, is_UvT, return_err, mindim); }, py::arg("Tin"), py::arg("keepdim"), py::arg("err") = double(0), py::arg("is_UvT") = true, - py::arg("return_err") = (unsigned int)(0)); + py::arg("return_err") = (unsigned int)(0), py::arg("mindim") = 0); m_linalg.def( "Svd_truncate", [](const UniTensor &Tin, const cytnx_uint64 &keepdim, const cytnx_double &err, - const bool &is_UvT, const unsigned int &return_err) { - return cytnx::linalg::Svd_truncate(Tin, keepdim, err, is_UvT, return_err); + const bool &is_UvT, const unsigned int &return_err, const unsigned int &mindim) { + return cytnx::linalg::Svd_truncate(Tin, keepdim, err, is_UvT, return_err, mindim); }, py::arg("Tin"), py::arg("keepdim"), py::arg("err") = 0, py::arg("is_UvT") = true, - py::arg("return_err") = (unsigned int)(0)); + py::arg("return_err") = (unsigned int)(0), py::arg("mindim") = 0); // m_linalg.def("Eigh", &cytnx::linalg::Eigh, py::arg("Tin"), py::arg("is_V") = true, // py::arg("row_v") = false); @@ -523,6 +523,25 @@ void linalg_binding(py::module &m) { py::arg("Tn"), py::arg("mode"), py::arg("is_core") = true, py::arg("is_Ls") = false, py::arg("truncate_dim") = std::vector()); + m_linalg.def( + "Arnoldi", + [](LinOp *Hop, const Tensor &Tin, const std::string which, const cytnx_uint64 &Maxiter, + const double &CvgCrit, const cytnx_uint64 &k, const bool &is_V, const bool &verbose) { + return cytnx::linalg::Arnoldi(Hop, Tin, which, Maxiter, CvgCrit, k, is_V, verbose); + }, + py::arg("Hop"), py::arg("Tin"), py::arg("which") = "LM", py::arg("Maxiter") = 10000, + py::arg("CvgCrit") = 1.0e-9, py::arg("k") = 1, py::arg("is_V") = true, + py::arg("verbose") = false); + m_linalg.def( + "Arnoldi", + [](LinOp *Hop, const UniTensor &Tin, const std::string which, const cytnx_uint64 &Maxiter, + const double &CvgCrit, const cytnx_uint64 &k, const bool &is_V, const bool &verbose) { + return cytnx::linalg::Arnoldi(Hop, Tin, which, Maxiter, CvgCrit, k, is_V, verbose); + }, + py::arg("Hop"), py::arg("Tin"), py::arg("which") = "LM", py::arg("Maxiter") = 10000, + py::arg("CvgCrit") = 1.0e-9, py::arg("k") = 1, py::arg("is_V") = true, + py::arg("verbose") = false); + m_linalg.def( "Lanczos", [](LinOp *Hop, const Tensor &Tin, const std::string method, const double &CvgCrit, @@ -546,6 +565,15 @@ void linalg_binding(py::module &m) { py::arg("Maxiter") = 10000, py::arg("k") = 1, py::arg("is_V") = true, py::arg("is_row") = false, 
py::arg("max_krydim") = 0, py::arg("verbose") = false); + m_linalg.def( + "Lanczos_Exp", + [](LinOp *Hop, const UniTensor &v, const Scalar &tau, const double &CvgCrit, + const unsigned int &Maxiter, const bool &verbose) { + return cytnx::linalg::Lanczos_Exp(Hop, v, tau, CvgCrit, Maxiter, verbose); + }, + py::arg("Hop"), py::arg("v"), py::arg("tau"), py::arg("CvgCrit") = 1.0e-14, + py::arg("Maxiter") = 10000, py::arg("verbose") = false); + m_linalg.def( "Lstsq", [](const Tensor &A, const Tensor &b, const float &rcond) { diff --git a/pybind/unitensor_py.cpp b/pybind/unitensor_py.cpp index b77d385a3..a841fafba 100644 --- a/pybind/unitensor_py.cpp +++ b/pybind/unitensor_py.cpp @@ -143,11 +143,16 @@ void unitensor_binding(py::module &m) { .def("c_set_rowrank_", &UniTensor::set_rowrank_, py::arg("new_rowrank")) .def("set_rowrank", &UniTensor::set_rowrank, py::arg("new_rowrank")) - + .def("relabel",[](UniTensor &self, const std::vector &new_labels){ + return self.relabel(new_labels); + }, py::arg("new_labels")) .def("relabels",[](UniTensor &self, const std::vector &new_labels){ return self.relabels(new_labels); }, py::arg("new_labels")) + .def("c_relabel_",[](UniTensor &self, const std::vector &new_labels){ + self.relabel_(new_labels); + }, py::arg("new_labels")) .def("c_relabels_",[](UniTensor &self, const std::vector &new_labels){ self.relabels_(new_labels); }, py::arg("new_labels")) @@ -169,6 +174,14 @@ void unitensor_binding(py::module &m) { },py::arg("old_label"), py::arg("new_label")) + .def("relabel",[](UniTensor &self, const std::vector &old_labels, const std::vector &new_labels){ + return self.relabel(old_labels,new_labels); + } ,py::arg("old_labels"), py::arg("new_labels")) + + .def("c_relabel_",[](UniTensor &self, const std::vector &old_labels, const std::vector &new_labels){ + self.relabel_(old_labels,new_labels); + } ,py::arg("old_labels"), py::arg("new_labels")) + .def("relabels",[](UniTensor &self, const std::vector &old_labels, const std::vector &new_labels){ return self.relabels(old_labels,new_labels); } ,py::arg("old_labels"), py::arg("new_labels")) @@ -1372,8 +1385,18 @@ void unitensor_binding(py::module &m) { .def("get_qindices", [](UniTensor &self, const cytnx_uint64 &bidx){return self.get_qindices(bidx);}); ; // end of object line - m.def("Contract", Contract, py::arg("Tl"), py::arg("Tr"), py::arg("cacheL") = false, - py::arg("cacheR") = false); + // m.def("Contract", Contract, py::arg("Tl"), py::arg("Tr"), py::arg("cacheL") = false, + // py::arg("cacheR") = false); + m.def( + "Contract", + [](const UniTensor &inL, const UniTensor &inR, const bool &cacheL, + const bool &cacheR) -> UniTensor { return Contract(inL, inR, cacheL, cacheR); }, + py::arg("Tl"), py::arg("Tr"), py::arg("cacheL") = false, py::arg("cacheR") = false); + m.def( + "Contract", + [](const std::vector &TNs, const std::string &order, + const bool &optimal) -> UniTensor { return Contract(TNs, order, optimal); }, + py::arg("TNs"), py::arg("order") = "", py::arg("optimal") = true); m.def( "Contracts", [](const std::vector &TNs, const std::string &order, diff --git a/pytests/example/TDVP/tdvp1_dense_test.py b/pytests/example/TDVP/tdvp1_dense_test.py new file mode 100644 index 000000000..7000640f5 --- /dev/null +++ b/pytests/example/TDVP/tdvp1_dense_test.py @@ -0,0 +1,25 @@ +import sys,os +import numpy as np +sys.path.append('example/TDVP/') + +from tdvp1_dense import * +from numpy import linalg as LA + +def test_tdvp1_dense(): + #prepare random MPS + Nsites = 20 # Number of sites + chi = 16 # MPS bond dimension + 
d = 2 + MPS_rand = prepare_rand_init_MPS(Nsites, chi, d) + + # simulate ground state by imaginary time evolution by tdvp + J = 0.0 + Jz = 0.0 + hx = 0.0 + hz = -1.0 + tau = -1.0j + time_step = 10 + # prepare up state + As, Es = tdvp1_XXZmodel_dense(J, Jz, hx, hz, MPS_rand, chi, tau, time_step) + error = np.abs(Es[-1]-(-1.0*Nsites)) + assert error < 1e-6 diff --git a/src/BlockUniTensor.cpp b/src/BlockUniTensor.cpp index a61fe5b6b..17f2198d0 100644 --- a/src/BlockUniTensor.cpp +++ b/src/BlockUniTensor.cpp @@ -672,6 +672,24 @@ namespace cytnx { this->permute_(mapper_i64, rowrank); } + boost::intrusive_ptr BlockUniTensor::relabel( + const std::vector &new_labels) { + BlockUniTensor *tmp = this->clone_meta(true, true); + tmp->_blocks = this->_blocks; + tmp->set_labels(new_labels); + boost::intrusive_ptr out(tmp); + return out; + } + + boost::intrusive_ptr BlockUniTensor::relabel( + const std::vector &old_labels, const std::vector &new_labels) { + BlockUniTensor *tmp = this->clone_meta(true, true); + tmp->_blocks = this->_blocks; + tmp->relabel_(old_labels, new_labels); + boost::intrusive_ptr out(tmp); + return out; + } + boost::intrusive_ptr BlockUniTensor::relabels( const std::vector &new_labels) { BlockUniTensor *tmp = this->clone_meta(true, true); @@ -1825,6 +1843,7 @@ namespace cytnx { this->_fx_group_duplicates(has_dup, idx_mappers); } + // Deprecated, internal use only void BlockUniTensor::combineBonds(const std::vector &indicators, const bool &force) { cytnx_error_msg(this->is_diag(), "[ERROR][BlockUniTensor] cannot combineBonds when is_diag = true!%s", "\n"); @@ -1956,7 +1975,7 @@ namespace cytnx { this->group_basis_(); } - void BlockUniTensor::combineBonds(const std::vector &indicators, const bool &force) { + void BlockUniTensor::combineBond(const std::vector &indicators, const bool &force) { cytnx_error_msg(indicators.size() < 2, "[ERROR] the number of bonds to combine must be > 1%s", "\n"); std::vector::iterator it; @@ -1971,6 +1990,11 @@ namespace cytnx { this->combineBonds(idx_mapper, force); } + void BlockUniTensor::combineBonds(const std::vector &indicators, const bool &force) { + this->combineBond(indicators, force); + } + + // Deprecated void BlockUniTensor::combineBonds(const std::vector &indicators, const bool &force, const bool &by_label) { cytnx_error_msg(indicators.size() < 2, "[ERROR] the number of bonds to combine must be > 1%s", diff --git a/src/DenseUniTensor.cpp b/src/DenseUniTensor.cpp index 04f623ac8..c2600ca14 100644 --- a/src/DenseUniTensor.cpp +++ b/src/DenseUniTensor.cpp @@ -214,6 +214,24 @@ namespace cytnx { } } + boost::intrusive_ptr DenseUniTensor::relabel( + const std::vector &new_labels) { + DenseUniTensor *out_raw = this->clone_meta(); + out_raw->_block = this->_block; + out_raw->set_labels(new_labels); + boost::intrusive_ptr out(out_raw); + return out; + } + + boost::intrusive_ptr DenseUniTensor::relabel( + const std::vector &old_labels, const std::vector &new_labels) { + DenseUniTensor *tmp = this->clone_meta(); + tmp->_block = this->_block; + tmp->relabel_(old_labels, new_labels); + boost::intrusive_ptr out(tmp); + return out; + } + boost::intrusive_ptr DenseUniTensor::relabels( const std::vector &new_labels) { DenseUniTensor *out_raw = this->clone_meta(); @@ -668,7 +686,7 @@ namespace cytnx { } return out; } - void DenseUniTensor::combineBonds(const std::vector &indicators, const bool &force) { + void DenseUniTensor::combineBond(const std::vector &indicators, const bool &force) { cytnx_error_msg(indicators.size() < 2, "[ERROR] the number of bonds to 
combine must be > 1%s", "\n"); std::vector::iterator it; @@ -682,6 +700,11 @@ namespace cytnx { } this->combineBonds(idx_mapper, force); } + // Deprecated + void DenseUniTensor::combineBonds(const std::vector &indicators, const bool &force) { + this->combineBond(indicators, force); + } + // Deprecated void DenseUniTensor::combineBonds(const std::vector &indicators, const bool &force) { cytnx_error_msg(indicators.size() < 2, "[ERROR] the number of bonds to combine must be > 1%s", "\n"); @@ -876,6 +899,7 @@ namespace cytnx { } */ + // Deprecated void DenseUniTensor::combineBonds(const std::vector &indicators, const bool &force, const bool &by_label) { cytnx_error_msg(indicators.size() < 2, "[ERROR] the number of bonds to combine must be > 1%s", diff --git a/src/LinOp.cpp b/src/LinOp.cpp index 7f84002a5..40a244afa 100644 --- a/src/LinOp.cpp +++ b/src/LinOp.cpp @@ -33,7 +33,7 @@ namespace cytnx { #ifdef UNI_OMP - //#pragma omp parallel for + // #pragma omp parallel for for (cytnx_uint64 x = 0; x < this->_elems.size(); x++) { auto it = this->_elems.begin(); advance(it, x); diff --git a/src/RegularNetwork.cpp b/src/RegularNetwork.cpp index fd5567736..5bd421901 100644 --- a/src/RegularNetwork.cpp +++ b/src/RegularNetwork.cpp @@ -43,40 +43,55 @@ namespace cytnx { vector tmp = str_split(line, false, ";"); // cytnx_error_msg(tmp.size() != 2, "[ERROR][Network][Fromfile] line:%d %s\n", line_num, // "Invalid TOUT line"); - if (tmp.size() != 2) { - // tmp.push_back(""); - tmp.insert(tmp.begin(), ""); - } - - // handle col-space label - vector ket_labels = str_split(tmp[0], false, ","); - if (ket_labels.size() == 1) - if (ket_labels[0].length() == 0) ket_labels.clear(); - for (cytnx_uint64 i = 0; i < ket_labels.size(); i++) { - string tmp = str_strip(ket_labels[i]); - cytnx_error_msg(tmp.length() == 0, - "[ERROR][Network][Fromfile] line:%d Invalid labels for TOUT line.%s", - line_num, "\n"); - // cytnx_error_msg((tmp.find_first_not_of("0123456789-") != string::npos), - // "[ERROR][Network][Fromfile] line:%d %s\n", line_num, - // "Invalid TOUT line. label contain non integer."); - labels.push_back(tmp); - } - TOUT_iBondNum = labels.size(); - - // handle row-space label - vector bra_labels = str_split(tmp[1], false, ","); - if (bra_labels.size() == 1) - if (bra_labels[0].length() == 0) bra_labels.clear(); - for (cytnx_uint64 i = 0; i < bra_labels.size(); i++) { - string tmp = str_strip(bra_labels[i]); - cytnx_error_msg(tmp.length() == 0, - "[ERROR][Network][Fromfile] line:%d Invalid labels for TOUT line.%s", - line_num, "\n"); - // cytnx_error_msg((tmp.find_first_not_of("0123456789-") != string::npos), - // "[ERROR][Network][Fromfile] line:%d %s\n", line_num, - // "Invalid TOUT line. 
label contain non integer."); - labels.push_back(tmp); + if (tmp.size() == 1) { + // no ; provided + vector tout_lbls = str_split(line, false, ","); + cytnx_uint64 default_rowrank = tout_lbls.size() / 2; + // handle col-space label + for (cytnx_uint64 i = 0; i < default_rowrank; i++) { + string tmp_ = str_strip(tout_lbls[i]); + cytnx_error_msg(tmp_.length() == 0, + "[ERROR][Network][Fromfile] line:%d Invalid labels for TOUT line.%s", + line_num, "\n"); + labels.push_back(tmp_); + } + TOUT_iBondNum = labels.size(); + // handle row-space label + for (cytnx_uint64 i = default_rowrank; i < tout_lbls.size(); i++) { + string tmp_ = str_strip(tout_lbls[i]); + cytnx_error_msg(tmp_.length() == 0, + "[ERROR][Network][Fromfile] line:%d Invalid labels for TOUT line.%s", + line_num, "\n"); + labels.push_back(tmp_); + } + } else if (tmp.size() == 2) { + // one ; + // handle col-space label + vector ket_labels = str_split(tmp[0], false, ","); + if (ket_labels.size() == 1) + if (ket_labels[0].length() == 0) ket_labels.clear(); + for (cytnx_uint64 i = 0; i < ket_labels.size(); i++) { + string tmp = str_strip(ket_labels[i]); + cytnx_error_msg(tmp.length() == 0, + "[ERROR][Network][Fromfile] line:%d Invalid labels for TOUT line.%s", + line_num, "\n"); + labels.push_back(tmp); + } + TOUT_iBondNum = labels.size(); + // handle row-space label + vector bra_labels = str_split(tmp[1], false, ","); + if (bra_labels.size() == 1) + if (bra_labels[0].length() == 0) bra_labels.clear(); + for (cytnx_uint64 i = 0; i < bra_labels.size(); i++) { + string tmp = str_strip(bra_labels[i]); + cytnx_error_msg(tmp.length() == 0, + "[ERROR][Network][Fromfile] line:%d Invalid labels for TOUT line.%s", + line_num, "\n"); + labels.push_back(tmp); + } + } else if (tmp.size() > 2) { + // more than one ; + cytnx_error_msg(true, "[ERROR][Network] line:%d Invalid TOUT line.%s", line_num, "\n"); } } @@ -211,8 +226,16 @@ namespace cytnx { return res; } + /* + * This function is not used in the codebase and not implemented properly. + * Currently disabled. 
+ */ vector> CtTree_to_eisumpath(ContractionTree CtTree, vector tns) { + cytnx_error_msg(true, + "[Error][RegularNetwork] CtTree_to_eisumpath is not properly implemented, " + "disabled for now.%s", + "\n"); vector> path; stack stk; Node *root = &(CtTree.nodes_container.back()); @@ -622,7 +645,7 @@ namespace cytnx { } else { CtTree.build_default_contraction_tree(); } - this->einsum_path = CtTree_to_eisumpath(CtTree, names); + // this->einsum_path = CtTree_to_eisumpath(CtTree, names); } void RegularNetwork::Fromfile(const string &fname) { @@ -909,7 +932,7 @@ namespace cytnx { if (contract_order != "") { _parse_ORDER_line_(ORDER_tokens, contract_order, 999999); CtTree.build_contraction_tree_by_tokens(this->name2pos, ORDER_tokens); - this->einsum_path = CtTree_to_eisumpath(CtTree, names); + // this->einsum_path = CtTree_to_eisumpath(CtTree, names); } else { CtTree.build_default_contraction_tree(); } @@ -1267,7 +1290,7 @@ namespace cytnx { } else { CtTree.build_default_contraction_tree(); } - this->einsum_path = CtTree_to_eisumpath(CtTree, names); + // this->einsum_path = CtTree_to_eisumpath(CtTree, names); } // end construct } // namespace cytnx diff --git a/src/UniTensor_base.cpp b/src/UniTensor_base.cpp index 71f794573..2f9f0fd06 100644 --- a/src/UniTensor_base.cpp +++ b/src/UniTensor_base.cpp @@ -272,6 +272,50 @@ namespace cytnx { this->_labels = new_labels; } + boost::intrusive_ptr UniTensor_base::relabel( + const std::vector &new_labels) { + cytnx_error_msg(true, "[ERROR] fatal internal, cannot call on a un-initialize UniTensor_base%s", + "\n"); + return nullptr; + } + void UniTensor_base::relabel_(const std::vector &new_labels) { + this->set_labels(new_labels); + } + boost::intrusive_ptr UniTensor_base::relabel( + const std::vector &old_labels, const std::vector &new_labels) { + cytnx_error_msg(true, "[ERROR] fatal internal, cannot call on a un-initialize UniTensor_base%s", + "\n"); + return nullptr; + } + void UniTensor_base::relabel_(const std::vector &old_labels, + const std::vector &new_labels) { + cytnx_error_msg(old_labels.size() != new_labels.size(), + "[ERROR] old_labels and new_labels should have the same size.%s", "\n"); + + // 1) checking old_labels have no duplicate and are all exists: + // 2) getting all the replace locations + cytnx_error_msg(vec_unique(old_labels).size() != old_labels.size(), + "[ERROR] old_labels cannot have duplicated elements.%s", "\n"); + + std::vector loc_inds(old_labels.size()); + for (cytnx_int64 i = 0; i < loc_inds.size(); i++) { + auto res = std::find(this->_labels.begin(), this->_labels.end(), old_labels[i]); + cytnx_error_msg(res == this->_labels.end(), "[ERROR] label %s not exists.\n", + old_labels[i].c_str()); + + loc_inds[i] = std::distance(this->_labels.begin(), res); + } + + // 3) replace the labels: + for (cytnx_int64 i = 0; i < loc_inds.size(); i++) { + this->_labels[loc_inds[i]] = new_labels[i]; + } + + // 4) chk duplicate: + cytnx_error_msg(vec_unique(this->_labels).size() != this->_labels.size(), + "[ERROR] final output UniTensor have duplicated elements.%s", "\n"); + } + boost::intrusive_ptr UniTensor_base::relabels( const std::vector &new_labels) { cytnx_error_msg(true, "[ERROR] fatal internal, cannot call on a un-initialize UniTensor_base%s", @@ -340,16 +384,23 @@ namespace cytnx { cytnx_error_msg(true, "[ERROR] fatal internal, cannot call on a un-initialize UniTensor_base%s", "\n"); } + // Deprecated void UniTensor_base::combineBonds(const std::vector &indicators, const bool &permute_back, const bool &by_labels) { 
cytnx_error_msg(true, "[ERROR] fatal internal, cannot call on a un-initialize UniTensor_base%s", "\n"); } + void UniTensor_base::combineBond(const std::vector &indicators, + const bool &permute_back) { + cytnx_error_msg(true, "[ERROR] fatal internal, cannot call on a un-initialize UniTensor_base%s", + "\n"); + } void UniTensor_base::combineBonds(const std::vector &indicators, const bool &permute_back) { cytnx_error_msg(true, "[ERROR] fatal internal, cannot call on a un-initialize UniTensor_base%s", "\n"); } + // Deprecated void UniTensor_base::combineBonds(const std::vector &indicators, const bool &permute_back) { cytnx_error_msg(true, "[ERROR] fatal internal, cannot call on a un-initialize UniTensor_base%s", @@ -590,6 +641,10 @@ namespace cytnx { } void _resolve_CT(std::vector &TNlist){}; + UniTensor Contract(const std::vector &TNs, const std::string &order = "", + const bool &optimal = true) { + return Contracts(TNs, order, optimal); + } UniTensor Contracts(const std::vector &TNs, const std::string &order = "", const bool &optimal = true) { cytnx_error_msg(TNs.size() < 2, "[ERROR][Contracts] should have more than 2 TNs to contract.%s", diff --git a/src/backend/linalg_internal_cpu/Kron_internal.cpp b/src/backend/linalg_internal_cpu/Kron_internal.cpp index ff98209ff..8634f2969 100644 --- a/src/backend/linalg_internal_cpu/Kron_internal.cpp +++ b/src/backend/linalg_internal_cpu/Kron_internal.cpp @@ -3,7 +3,7 @@ #include "utils/complex_arithmetic.hpp" #include "../utils_internal_interface.hpp" #include -//#include "backend/lapack_wrapper.hpp" +// #include "backend/lapack_wrapper.hpp" #ifdef UNI_OMP #include #endif diff --git a/src/backend/linalg_internal_cpu/Mul_internal.cpp b/src/backend/linalg_internal_cpu/Mul_internal.cpp index 67a8adbb2..b3497f218 100644 --- a/src/backend/linalg_internal_cpu/Mul_internal.cpp +++ b/src/backend/linalg_internal_cpu/Mul_internal.cpp @@ -23,24 +23,24 @@ namespace cytnx { blas_int N = len; blas_int ONE = 1; if (Lin->size() == 1) { - //#ifdef UNI_OMP - // #pragma omp parallel for schedule(dynamic) - //#endif - // for(unsigned long long i=0;isize() == 1) { - //#ifdef UNI_OMP - //#pragma omp parallel for schedule(dynamic) - //#endif - // for(unsigned long long i=0;isize() == 1) { - //#ifdef UNI_OMP - //#pragma omp parallel for schedule(dynamic) - //#endif - // for(unsigned long long i=0;isize() == 1) { - //#ifdef UNI_OMP - //#pragma omp parallel for schedule(dynamic) - //#endif - // for(unsigned long long i=0;isize() == 1) { - //#ifdef UNI_OMP - //#pragma omp parallel for schedule(dynamic) - //#endif - // for(unsigned long long i=0;isize() == 1) { - //#ifdef UNI_OMP - //#pragma omp parallel for schedule(dynamic) - //#endif - // for(unsigned long long i=0;isize() == 1) { - //#ifdef UNI_OMP - //#pragma omp parallel for schedule(dynamic) - //#endif - // for(unsigned long long i=0;isize() == 1) { - //#ifdef UNI_OMP - //#pragma omp parallel for schedule(dynamic) - //#endif - // for(unsigned long long i=0;isize() == 1) { - //#ifdef UNI_OMP - //#pragma omp parallel for schedule(dynamic) - //#endif - // for(unsigned long long i=0;isize() == 1) { - //#ifdef UNI_OMP - //#pragma omp parallel for schedule(dynamic) - //#endif - // for(unsigned long long i=0;isize() == 1) { - //#ifdef UNI_OMP - //#pragma omp parallel for schedule(dynamic) - //#endif - // for(unsigned long long i=0;isize() == 1) { - //#ifdef UNI_OMP - //#pragma omp parallel for schedule(dynamic) - //#endif - // for(unsigned long long i=0;i +#include "memcpyTruncation.hpp" + #include +#include #include -#include 
"memcpyTruncation.hpp" #include "Tensor.hpp" namespace cytnx { @@ -11,7 +12,8 @@ namespace cytnx { void memcpyTruncation_cd(Tensor &U, Tensor &vT, Tensor &S, Tensor &terr, const cytnx_uint64 &keepdim, const double &err, const bool &is_U, - const bool &is_vT, const unsigned int &return_err) { + const bool &is_vT, const unsigned int &return_err, + const unsigned int &mindim) { // determine the truc_dim cytnx_uint64 Kdim = keepdim; cytnx_uint64 nums = S.storage().size(); @@ -20,7 +22,7 @@ namespace cytnx { } cytnx_uint64 truc_dim = Kdim; for (cytnx_int64 i = Kdim - 1; i >= 0; i--) { - if (((cytnx_double *)S._impl->storage()._impl->Mem)[i] < err) { + if (((cytnx_double *)S._impl->storage()._impl->Mem)[i] < err and truc_dim - 1 >= mindim) { truc_dim--; } else { break; @@ -77,7 +79,8 @@ namespace cytnx { void memcpyTruncation_cf(Tensor &U, Tensor &vT, Tensor &S, Tensor &terr, const cytnx_uint64 &keepdim, const double &err, const bool &is_U, - const bool &is_vT, const unsigned int &return_err) { + const bool &is_vT, const unsigned int &return_err, + const unsigned int &mindim) { // determine the truc_dim cytnx_uint64 Kdim = keepdim; cytnx_uint64 nums = S.storage().size(); @@ -86,7 +89,7 @@ namespace cytnx { } cytnx_uint64 truc_dim = Kdim; for (cytnx_int64 i = Kdim - 1; i >= 0; i--) { - if (((cytnx_double *)S._impl->storage()._impl->Mem)[i] < err) { + if (((cytnx_double *)S._impl->storage()._impl->Mem)[i] < err and truc_dim - 1 >= mindim) { truc_dim--; } else { break; @@ -143,7 +146,8 @@ namespace cytnx { void memcpyTruncation_d(Tensor &U, Tensor &vT, Tensor &S, Tensor &terr, const cytnx_uint64 &keepdim, const double &err, const bool &is_U, - const bool &is_vT, const unsigned int &return_err) { + const bool &is_vT, const unsigned int &return_err, + const unsigned int &mindim) { // determine the truc_dim cytnx_uint64 Kdim = keepdim; cytnx_uint64 nums = S.storage().size(); @@ -152,7 +156,7 @@ namespace cytnx { } cytnx_uint64 truc_dim = Kdim; for (cytnx_int64 i = Kdim - 1; i >= 0; i--) { - if (((cytnx_double *)S._impl->storage()._impl->Mem)[i] < err) { + if (((cytnx_double *)S._impl->storage()._impl->Mem)[i] < err and truc_dim - 1 >= mindim) { truc_dim--; } else { break; @@ -209,7 +213,8 @@ namespace cytnx { void memcpyTruncation_f(Tensor &U, Tensor &vT, Tensor &S, Tensor &terr, const cytnx_uint64 &keepdim, const double &err, const bool &is_U, - const bool &is_vT, const unsigned int &return_err) { + const bool &is_vT, const unsigned int &return_err, + const unsigned int &mindim) { // determine the truc_dim cytnx_uint64 Kdim = keepdim; cytnx_uint64 nums = S.storage().size(); @@ -218,7 +223,7 @@ namespace cytnx { } cytnx_uint64 truc_dim = Kdim; for (cytnx_int64 i = Kdim - 1; i >= 0; i--) { - if (((cytnx_double *)S._impl->storage()._impl->Mem)[i] < err) { + if (((cytnx_double *)S._impl->storage()._impl->Mem)[i] < err and truc_dim - 1 >= mindim) { truc_dim--; } else { break; diff --git a/src/backend/linalg_internal_cpu/memcpyTruncation.hpp b/src/backend/linalg_internal_cpu/memcpyTruncation.hpp index 8d2fc1700..24db37947 100644 --- a/src/backend/linalg_internal_cpu/memcpyTruncation.hpp +++ b/src/backend/linalg_internal_cpu/memcpyTruncation.hpp @@ -3,25 +3,30 @@ #include #include + +#include "Tensor.hpp" #include "Type.hpp" #include "cytnx_error.hpp" -#include "Tensor.hpp" namespace cytnx { namespace linalg_internal { void memcpyTruncation_cd(Tensor &U, Tensor &vT, Tensor &S, Tensor &terr, const cytnx_uint64 &keepdim, const double &err, const bool &is_U, - const bool &is_vT, const unsigned int 
&return_err); + const bool &is_vT, const unsigned int &return_err, + const unsigned int &mindim); void memcpyTruncation_cf(Tensor &U, Tensor &vT, Tensor &S, Tensor &terr, const cytnx_uint64 &keepdim, const double &err, const bool &is_U, - const bool &is_vT, const unsigned int &return_err); + const bool &is_vT, const unsigned int &return_err, + const unsigned int &mindim); void memcpyTruncation_d(Tensor &U, Tensor &vT, Tensor &S, Tensor &terr, const cytnx_uint64 &keepdim, const double &err, const bool &is_U, - const bool &is_vT, const unsigned int &return_err); + const bool &is_vT, const unsigned int &return_err, + const unsigned int &mindim); void memcpyTruncation_f(Tensor &U, Tensor &vT, Tensor &S, Tensor &terr, const cytnx_uint64 &keepdim, const double &err, const bool &is_U, - const bool &is_vT, const unsigned int &return_err); + const bool &is_vT, const unsigned int &return_err, + const unsigned int &mindim); } // namespace linalg_internal } // namespace cytnx diff --git a/src/backend/linalg_internal_gpu/cuAbs_internal.cu b/src/backend/linalg_internal_gpu/cuAbs_internal.cu index 84536de27..5cf928151 100644 --- a/src/backend/linalg_internal_gpu/cuAbs_internal.cu +++ b/src/backend/linalg_internal_gpu/cuAbs_internal.cu @@ -1,8 +1,8 @@ #include "cuAbs_internal.hpp" #include "../utils_internal_interface.hpp" -//#include "cytnx_error.hpp" -//#include "utils/backend/lapack_wrapper.hpp" +// #include "cytnx_error.hpp" +// #include "utils/backend/lapack_wrapper.hpp" #ifdef UNI_OMP #include diff --git a/src/backend/linalg_internal_gpu/cuDiag_internal.cu b/src/backend/linalg_internal_gpu/cuDiag_internal.cu index 45004e064..54fb69a6a 100644 --- a/src/backend/linalg_internal_gpu/cuDiag_internal.cu +++ b/src/backend/linalg_internal_gpu/cuDiag_internal.cu @@ -1,9 +1,9 @@ #include "cuDiag_internal.hpp" #include "../utils_internal_interface.hpp" -//#ifdef UNI_OMP -// #include -//#endif +// #ifdef UNI_OMP +// #include +// #endif namespace cytnx { diff --git a/src/backend/linalg_internal_gpu/cuExp_internal.cu b/src/backend/linalg_internal_gpu/cuExp_internal.cu index dab940ed6..d1d1aee38 100644 --- a/src/backend/linalg_internal_gpu/cuExp_internal.cu +++ b/src/backend/linalg_internal_gpu/cuExp_internal.cu @@ -1,9 +1,9 @@ #include "cuExp_internal.hpp" #include "../utils_internal_interface.hpp" -//#ifdef UNI_OMP -// #include -//#endif +// #ifdef UNI_OMP +// #include +// #endif namespace cytnx { diff --git a/src/backend/linalg_internal_gpu/cuGeSvd_internal.hpp b/src/backend/linalg_internal_gpu/cuGeSvd_internal.hpp index 4d89a4448..45378c0dd 100644 --- a/src/backend/linalg_internal_gpu/cuGeSvd_internal.hpp +++ b/src/backend/linalg_internal_gpu/cuGeSvd_internal.hpp @@ -9,7 +9,7 @@ #include "Type.hpp" #include "cytnx_error.hpp" #include "backend/lapack_wrapper.hpp" -//#include "../linalg_internal_interface.hpp" +// #include "../linalg_internal_interface.hpp" namespace cytnx { namespace linalg_internal { diff --git a/src/backend/linalg_internal_gpu/cuInv_inplace_internal.cu b/src/backend/linalg_internal_gpu/cuInv_inplace_internal.cu index 0eabb5e10..d7f0c5d5f 100644 --- a/src/backend/linalg_internal_gpu/cuInv_inplace_internal.cu +++ b/src/backend/linalg_internal_gpu/cuInv_inplace_internal.cu @@ -1,9 +1,9 @@ #include "cuInv_inplace_internal.hpp" #include "../utils_internal_interface.hpp" -//#ifdef UNI_OMP -// #include -//#endif +// #ifdef UNI_OMP +// #include +// #endif namespace cytnx { diff --git a/src/backend/linalg_internal_gpu/cuPow_internal.cu b/src/backend/linalg_internal_gpu/cuPow_internal.cu index 
816fa3d98..71771d157 100644 --- a/src/backend/linalg_internal_gpu/cuPow_internal.cu +++ b/src/backend/linalg_internal_gpu/cuPow_internal.cu @@ -1,9 +1,9 @@ #include "cuPow_internal.hpp" #include "../utils_internal_interface.hpp" -//#ifdef UNI_OMP -// #include -//#endif +// #ifdef UNI_OMP +// #include +// #endif namespace cytnx { diff --git a/src/backend/linalg_internal_gpu/cuTensordot_internal.cu b/src/backend/linalg_internal_gpu/cuTensordot_internal.cu index 4d89ca8b2..eb48fc2dd 100644 --- a/src/backend/linalg_internal_gpu/cuTensordot_internal.cu +++ b/src/backend/linalg_internal_gpu/cuTensordot_internal.cu @@ -14,14 +14,25 @@ } \ }; + #define HANDLE_CUDA_ERROR(x) \ + { \ + const auto err = x; \ + if (err != cudaSuccess) { \ + printf("Error in line %d: %s\n", __LINE__, err); \ + exit(-1); \ + } \ + }; + namespace cytnx { namespace linalg_internal { inline void _cuTensordot_internal(Tensor &out, const Tensor &Lin, const Tensor &Rin, const std::vector &idxl, - const std::vector &idxr, cudaDataType_t type, - cutensorComputeType_t typeCompute, void *alpha, void *beta) { + const std::vector &idxr, + cutensorDataType_t type, + cutensorComputeDescriptor_t descCompute, void *alpha, + void *beta) { // hostType alpha = (hostType){1.f, 0.f}; // hostType beta = (hostType){0.f, 0.f}; @@ -90,94 +101,89 @@ namespace cytnx { * cuTENSOR *************************/ - cutensorHandle_t *handle; + cutensorHandle_t handle; HANDLE_ERROR(cutensorCreate(&handle)); /********************** * Create Tensor Descriptors **********************/ + // This is the default alignment of cudaMalloc() and may also be the default alignment of + // cudaMallocManaged() + cytnx_uint64 defaultAlignment = 256; cutensorTensorDescriptor_t descL; - HANDLE_ERROR(cutensorInitTensorDescriptor(handle, &descL, nlabelL, extentL.data(), - NULL, /*stride*/ - type, CUTENSOR_OP_IDENTITY)); + HANDLE_ERROR(cutensorCreateTensorDescriptor(handle, &descL, nlabelL, extentL.data(), + NULL, /*stride*/ + type, defaultAlignment)); cutensorTensorDescriptor_t descR; - HANDLE_ERROR(cutensorInitTensorDescriptor(handle, &descR, nlabelR, extentR.data(), - NULL, /*stride*/ - type, CUTENSOR_OP_IDENTITY)); + HANDLE_ERROR(cutensorCreateTensorDescriptor(handle, &descR, nlabelR, extentR.data(), + NULL, /*stride*/ + type, defaultAlignment)); cutensorTensorDescriptor_t descOut; - HANDLE_ERROR(cutensorInitTensorDescriptor(handle, &descOut, nlabelOut, extentOut.data(), - NULL, /*stride*/ - type, CUTENSOR_OP_IDENTITY)); - - /********************************************** - * Retrieve the memory alignment for each tensor - **********************************************/ - - uint32_t alignmentRequirementL; - HANDLE_ERROR(cutensorGetAlignmentRequirement(handle, lPtr, &descL, &alignmentRequirementL)); - - uint32_t alignmentRequirementR; - HANDLE_ERROR(cutensorGetAlignmentRequirement(handle, rPtr, &descR, &alignmentRequirementR)); - - uint32_t alignmentRequirementOut; - HANDLE_ERROR( - cutensorGetAlignmentRequirement(handle, outPtr, &descOut, &alignmentRequirementOut)); + HANDLE_ERROR(cutensorCreateTensorDescriptor(handle, &descOut, nlabelOut, extentOut.data(), + NULL, /*stride*/ + type, defaultAlignment)); /******************************* * Create Contraction Descriptor *******************************/ - cutensorContractionDescriptor_t desc; - HANDLE_ERROR(cutensorInitContractionDescriptor( - handle, &desc, &descL, labelL.data(), alignmentRequirementL, &descR, labelR.data(), - alignmentRequirementR, &descOut, labelOut.data(), alignmentRequirementOut, &descOut, - 
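+ // cuTENSOR 2.x migration: the 1.x descriptor/find/plan calls removed above are replaced by an operation descriptor, a plan preference and a plan, and the per-tensor alignment queries are dropped because the alignment is now supplied when each tensor descriptor is created.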
labelOut.data(), alignmentRequirementOut, typeCompute)); + cutensorOperationDescriptor_t desc; + HANDLE_ERROR( + cutensorCreateContraction(handle, &desc, descL, labelL.data(), CUTENSOR_OP_IDENTITY, descR, + labelR.data(), CUTENSOR_OP_IDENTITY, descOut, labelOut.data(), + CUTENSOR_OP_IDENTITY, descOut, labelOut.data(), descCompute)); /************************** * Set the algorithm to use ***************************/ - cutensorContractionFind_t find; - HANDLE_ERROR(cutensorInitContractionFind(handle, &find, CUTENSOR_ALGO_DEFAULT)); + cutensorPlanPreference_t planPref; + cutensorCreatePlanPreference(handle, &planPref, CUTENSOR_ALGO_DEFAULT, + CUTENSOR_JIT_MODE_NONE); /********************** * Query workspace **********************/ - uint64_t worksize = 0; - HANDLE_ERROR(cutensorContractionGetWorkspaceSize(handle, &desc, &find, - CUTENSOR_WORKSPACE_RECOMMENDED, &worksize)); - - void *work = nullptr; - if (worksize > 0) { - if (cudaSuccess != cudaMalloc(&work, worksize)) { - work = nullptr; - worksize = 0; - } - } + uint64_t workspaceSizeEstimate = 0; + cutensorEstimateWorkspaceSize(handle, desc, planPref, CUTENSOR_WORKSPACE_DEFAULT, + &workspaceSizeEstimate); /************************** * Create Contraction Plan **************************/ - cutensorContractionPlan_t plan; - HANDLE_ERROR(cutensorInitContractionPlan(handle, &plan, &desc, &find, worksize)); + cutensorPlan_t plan; + cutensorCreatePlan(handle, &plan, desc, planPref, workspaceSizeEstimate); + + uint64_t actualWorkspaceSize = 0; + cutensorPlanGetAttribute(handle, plan, CUTENSOR_PLAN_REQUIRED_WORKSPACE, &actualWorkspaceSize, + sizeof(actualWorkspaceSize)); + + void *work = nullptr; + if (actualWorkspaceSize > 0) HANDLE_CUDA_ERROR(cudaMalloc(&work, actualWorkspaceSize)); /********************** * Run **********************/ - HANDLE_ERROR(cutensorContraction(handle, &plan, alpha, lPtr, rPtr, beta, outPtr, outPtr, work, - worksize, 0 /* stream */)); + cutensorContract(handle, plan, (void *)alpha, lPtr, rPtr, (void *)beta, outPtr, outPtr, work, + actualWorkspaceSize, /* stream */ 0); /*************************/ cudaDeviceSynchronize(); if (work) checkCudaErrors(cudaFree(work)); + // XXX: Can the OperationDescriptor be destories before running? 
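+ // Note: the operation descriptor `desc` itself is not released below; if the installed cuTENSOR version exposes cutensorDestroyOperationDescriptor(), calling it once the plan has been created would presumably be safe, but that is left unverified here.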
+ HANDLE_ERROR(cutensorDestroyTensorDescriptor(descL)); + HANDLE_ERROR(cutensorDestroyTensorDescriptor(descR)); + HANDLE_ERROR(cutensorDestroyTensorDescriptor(descOut)); + HANDLE_ERROR(cutensorDestroyPlanPreference(planPref)); + HANDLE_ERROR(cutensorDestroyPlan(plan)); HANDLE_ERROR(cutensorDestroy(handle)); } @@ -185,80 +191,52 @@ namespace cytnx { const std::vector &idxl, const std::vector &idxr) { typedef cuDoubleComplex hostType; - cudaDataType_t type = CUDA_C_64F; - cutensorComputeType_t typeCompute = CUTENSOR_COMPUTE_64F; + cutensorDataType_t type = CUTENSOR_C_64F; + cutensorComputeDescriptor_t descCompute = CUTENSOR_COMPUTE_DESC_64F; hostType alpha = {1., 0.}; hostType beta = {0., 0.}; - _cuTensordot_internal(out, Lin, Rin, idxl, idxr, type, typeCompute, &alpha, &beta); + _cuTensordot_internal(out, Lin, Rin, idxl, idxr, type, descCompute, &alpha, &beta); } void cuTensordot_internal_cf(Tensor &out, const Tensor &Lin, const Tensor &Rin, const std::vector &idxl, const std::vector &idxr) { typedef cuFloatComplex hostType; - cudaDataType_t type = CUDA_C_32F; - cutensorComputeType_t typeCompute = CUTENSOR_COMPUTE_32F; + cutensorDataType_t type = CUTENSOR_C_32F; + cutensorComputeDescriptor_t descCompute = CUTENSOR_COMPUTE_DESC_32F; hostType alpha = {1.f, 0.f}; hostType beta = {0.f, 0.f}; - _cuTensordot_internal(out, Lin, Rin, idxl, idxr, type, typeCompute, &alpha, &beta); + _cuTensordot_internal(out, Lin, Rin, idxl, idxr, type, descCompute, &alpha, &beta); } void cuTensordot_internal_d(Tensor &out, const Tensor &Lin, const Tensor &Rin, const std::vector &idxl, const std::vector &idxr) { typedef double hostType; - cudaDataType_t type = CUDA_R_64F; - cutensorComputeType_t typeCompute = CUTENSOR_COMPUTE_64F; + cutensorDataType_t type = CUTENSOR_R_64F; + cutensorComputeDescriptor_t descCompute = CUTENSOR_COMPUTE_DESC_64F; hostType alpha = 1.f; hostType beta = 0.f; - _cuTensordot_internal(out, Lin, Rin, idxl, idxr, type, typeCompute, &alpha, &beta); + _cuTensordot_internal(out, Lin, Rin, idxl, idxr, type, descCompute, &alpha, &beta); } void cuTensordot_internal_f(Tensor &out, const Tensor &Lin, const Tensor &Rin, const std::vector &idxl, const std::vector &idxr) { typedef float hostType; - cudaDataType_t type = CUDA_R_32F; - cutensorComputeType_t typeCompute = CUTENSOR_COMPUTE_32F; + cutensorDataType_t type = CUTENSOR_R_32F; + cutensorComputeDescriptor_t descCompute = CUTENSOR_COMPUTE_DESC_32F; hostType alpha = 1.f; hostType beta = 0.f; - _cuTensordot_internal(out, Lin, Rin, idxl, idxr, type, typeCompute, &alpha, &beta); - } - - void cuTensordot_internal_u32(Tensor &out, const Tensor &Lin, const Tensor &Rin, - const std::vector &idxl, - const std::vector &idxr) { - typedef cytnx_uint32 hostType; - cudaDataType_t type = CUDA_R_32U; - - cutensorComputeType_t typeCompute = CUTENSOR_COMPUTE_32U; - - hostType alpha = 1; - hostType beta = 0; - - _cuTensordot_internal(out, Lin, Rin, idxl, idxr, type, typeCompute, &alpha, &beta); - } - - void cuTensordot_internal_i32(Tensor &out, const Tensor &Lin, const Tensor &Rin, - const std::vector &idxl, - const std::vector &idxr) { - typedef cytnx_int32 hostType; - cudaDataType_t type = CUDA_R_32I; - - cutensorComputeType_t typeCompute = CUTENSOR_COMPUTE_32I; - - hostType alpha = 1; - hostType beta = 0; - - _cuTensordot_internal(out, Lin, Rin, idxl, idxr, type, typeCompute, &alpha, &beta); + _cuTensordot_internal(out, Lin, Rin, idxl, idxr, type, descCompute, &alpha, &beta); } } // namespace linalg_internal diff --git 
a/src/backend/linalg_internal_gpu/cuTensordot_internal.hpp b/src/backend/linalg_internal_gpu/cuTensordot_internal.hpp index bb81bc6e4..58c8aac6d 100644 --- a/src/backend/linalg_internal_gpu/cuTensordot_internal.hpp +++ b/src/backend/linalg_internal_gpu/cuTensordot_internal.hpp @@ -30,14 +30,6 @@ namespace cytnx { const std::vector &idxl, const std::vector &idxr); - void cuTensordot_internal_u32(Tensor &out, const Tensor &Lin, const Tensor &Rin, - const std::vector &idxl, - const std::vector &idxr); - - void cuTensordot_internal_i32(Tensor &out, const Tensor &Lin, const Tensor &Rin, - const std::vector &idxl, - const std::vector &idxr); - } // namespace linalg_internal } // namespace cytnx diff --git a/src/backend/linalg_internal_gpu/cudaMemcpyTruncation.cu b/src/backend/linalg_internal_gpu/cudaMemcpyTruncation.cu index 7abd656fb..df2fb47ca 100644 --- a/src/backend/linalg_internal_gpu/cudaMemcpyTruncation.cu +++ b/src/backend/linalg_internal_gpu/cudaMemcpyTruncation.cu @@ -31,7 +31,8 @@ namespace cytnx { #ifdef UNI_GPU void cudaMemcpyTruncation_cd(Tensor &U, Tensor &vT, Tensor &S, Tensor &terr, const cytnx_uint64 &keepdim, const double &err, const bool &is_U, - const bool &is_vT, const unsigned int &return_err) { + const bool &is_vT, const unsigned int &return_err, + const unsigned int &mindim) { // determine the truc_dim cytnx_uint64 Kdim = keepdim; cytnx_uint64 nums = S.storage().size(); @@ -40,7 +41,7 @@ namespace cytnx { } cytnx_uint64 truc_dim = Kdim; for (cytnx_int64 i = Kdim - 1; i >= 0; i--) { - if (((cytnx_double *)S._impl->storage()._impl->Mem)[i] < err) { + if (((cytnx_double *)S._impl->storage()._impl->Mem)[i] < err and truc_dim - 1 >= mindim) { truc_dim--; } else { break; @@ -101,7 +102,8 @@ namespace cytnx { void cudaMemcpyTruncation_cf(Tensor &U, Tensor &vT, Tensor &S, Tensor &terr, const cytnx_uint64 &keepdim, const double &err, const bool &is_U, - const bool &is_vT, const unsigned int &return_err) { + const bool &is_vT, const unsigned int &return_err, + const unsigned int &mindim) { // determine the truc_dim cytnx_uint64 Kdim = keepdim; cytnx_uint64 nums = S.storage().size(); @@ -110,7 +112,7 @@ namespace cytnx { } cytnx_uint64 truc_dim = Kdim; for (cytnx_int64 i = Kdim - 1; i >= 0; i--) { - if (((cytnx_double *)S._impl->storage()._impl->Mem)[i] < err) { + if (((cytnx_double *)S._impl->storage()._impl->Mem)[i] < err and truc_dim - 1 >= mindim) { truc_dim--; } else { break; @@ -171,7 +173,8 @@ namespace cytnx { void cudaMemcpyTruncation_d(Tensor &U, Tensor &vT, Tensor &S, Tensor &terr, const cytnx_uint64 &keepdim, const double &err, const bool &is_U, - const bool &is_vT, const unsigned int &return_err) { + const bool &is_vT, const unsigned int &return_err, + const unsigned int &mindim) { // determine the truc_dim cytnx_uint64 Kdim = keepdim; cytnx_uint64 nums = S.storage().size(); @@ -180,7 +183,7 @@ namespace cytnx { } cytnx_uint64 truc_dim = Kdim; for (cytnx_int64 i = Kdim - 1; i >= 0; i--) { - if (((cytnx_double *)S._impl->storage()._impl->Mem)[i] < err) { + if (((cytnx_double *)S._impl->storage()._impl->Mem)[i] < err and truc_dim - 1 >= mindim) { truc_dim--; } else { break; @@ -241,7 +244,8 @@ namespace cytnx { void cudaMemcpyTruncation_f(Tensor &U, Tensor &vT, Tensor &S, Tensor &terr, const cytnx_uint64 &keepdim, const double &err, const bool &is_U, - const bool &is_vT, const unsigned int &return_err) { + const bool &is_vT, const unsigned int &return_err, + const unsigned int &mindim) { // determine the truc_dim cytnx_uint64 Kdim = keepdim; cytnx_uint64 nums = 
S.storage().size(); @@ -250,7 +254,7 @@ namespace cytnx { } cytnx_uint64 truc_dim = Kdim; for (cytnx_int64 i = Kdim - 1; i >= 0; i--) { - if (((cytnx_double *)S._impl->storage()._impl->Mem)[i] < err) { + if (((cytnx_double *)S._impl->storage()._impl->Mem)[i] < err and truc_dim - 1 >= mindim) { truc_dim--; } else { break; diff --git a/src/backend/linalg_internal_gpu/cudaMemcpyTruncation.hpp b/src/backend/linalg_internal_gpu/cudaMemcpyTruncation.hpp index 8cdb1b961..8297b6317 100644 --- a/src/backend/linalg_internal_gpu/cudaMemcpyTruncation.hpp +++ b/src/backend/linalg_internal_gpu/cudaMemcpyTruncation.hpp @@ -3,9 +3,10 @@ #include #include + +#include "Tensor.hpp" #include "Type.hpp" #include "cytnx_error.hpp" -#include "Tensor.hpp" namespace cytnx { namespace linalg_internal { @@ -14,16 +15,20 @@ namespace cytnx { /// cuSvd void cudaMemcpyTruncation_cd(Tensor &U, Tensor &vT, Tensor &S, Tensor &terr, const cytnx_uint64 &keepdim, const double &err, const bool &is_U, - const bool &is_vT, const unsigned int &return_err); + const bool &is_vT, const unsigned int &return_err, + const unsigned int &mindim); void cudaMemcpyTruncation_cf(Tensor &U, Tensor &vT, Tensor &S, Tensor &terr, const cytnx_uint64 &keepdim, const double &err, const bool &is_U, - const bool &is_vT, const unsigned int &return_err); + const bool &is_vT, const unsigned int &return_err, + const unsigned int &mindim); void cudaMemcpyTruncation_d(Tensor &U, Tensor &vT, Tensor &S, Tensor &terr, const cytnx_uint64 &keepdim, const double &err, const bool &is_U, - const bool &is_vT, const unsigned int &return_err); + const bool &is_vT, const unsigned int &return_err, + const unsigned int &mindim); void cudaMemcpyTruncation_f(Tensor &U, Tensor &vT, Tensor &S, Tensor &terr, const cytnx_uint64 &keepdim, const double &err, const bool &is_U, - const bool &is_vT, const unsigned int &return_err); + const bool &is_vT, const unsigned int &return_err, + const unsigned int &mindim); #endif } // namespace linalg_internal diff --git a/src/backend/linalg_internal_interface.cpp b/src/backend/linalg_internal_interface.cpp index 901d52527..0ef1e356a 100644 --- a/src/backend/linalg_internal_interface.cpp +++ b/src/backend/linalg_internal_interface.cpp @@ -1403,8 +1403,6 @@ namespace cytnx { cuTensordot_ii[Type.ComplexFloat] = cuTensordot_internal_cf; cuTensordot_ii[Type.Double] = cuTensordot_internal_d; cuTensordot_ii[Type.Float] = cuTensordot_internal_f; - cuTensordot_ii[Type.Int32] = cuTensordot_internal_i32; - cuTensordot_ii[Type.Uint32] = cuTensordot_internal_u32; #endif #endif } diff --git a/src/backend/linalg_internal_interface.hpp b/src/backend/linalg_internal_interface.hpp index 2a6c83825..806790b18 100644 --- a/src/backend/linalg_internal_interface.hpp +++ b/src/backend/linalg_internal_interface.hpp @@ -4,67 +4,66 @@ #include #include "Type.hpp" -#include "backend/Storage.hpp" #include "backend/Scalar.hpp" +#include "backend/Storage.hpp" +#include "linalg_internal_cpu/Abs_internal.hpp" #include "linalg_internal_cpu/Arithmetic_internal.hpp" -#include "linalg_internal_cpu/iArithmetic_internal.hpp" -#include "linalg_internal_cpu/Sdd_internal.hpp" -#include "linalg_internal_cpu/Gesvd_internal.hpp" -#include "linalg_internal_cpu/Eigh_internal.hpp" +#include "linalg_internal_cpu/Axpy_internal.hpp" +#include "linalg_internal_cpu/Conj_inplace_internal.hpp" +#include "linalg_internal_cpu/Det_internal.hpp" +#include "linalg_internal_cpu/Diag_internal.hpp" #include "linalg_internal_cpu/Eig_internal.hpp" +#include "linalg_internal_cpu/Eigh_internal.hpp" 
+#include "linalg_internal_cpu/Exp_internal.hpp" +#include "linalg_internal_cpu/Gemm_Batch_internal.hpp" +#include "linalg_internal_cpu/Gemm_internal.hpp" +#include "linalg_internal_cpu/Ger_internal.hpp" +#include "linalg_internal_cpu/Gesvd_internal.hpp" #include "linalg_internal_cpu/InvM_inplace_internal.hpp" #include "linalg_internal_cpu/Inv_inplace_internal.hpp" -#include "linalg_internal_cpu/Conj_inplace_internal.hpp" -#include "linalg_internal_cpu/Exp_internal.hpp" -#include "linalg_internal_cpu/Matmul_internal.hpp" -#include "linalg_internal_cpu/Matmul_dg_internal.hpp" -#include "linalg_internal_cpu/Diag_internal.hpp" -#include "linalg_internal_cpu/Outer_internal.hpp" #include "linalg_internal_cpu/Kron_internal.hpp" -#include "linalg_internal_cpu/Vectordot_internal.hpp" -#include "linalg_internal_cpu/Tridiag_internal.hpp" -#include "linalg_internal_cpu/Norm_internal.hpp" +#include "linalg_internal_cpu/Lstsq_internal.hpp" +#include "linalg_internal_cpu/Matmul_dg_internal.hpp" +#include "linalg_internal_cpu/Matmul_internal.hpp" #include "linalg_internal_cpu/Matvec_internal.hpp" +#include "linalg_internal_cpu/MaxMin_internal.hpp" +#include "linalg_internal_cpu/Norm_internal.hpp" +#include "linalg_internal_cpu/Outer_internal.hpp" #include "linalg_internal_cpu/Pow_internal.hpp" -#include "linalg_internal_cpu/Abs_internal.hpp" #include "linalg_internal_cpu/QR_internal.hpp" -#include "linalg_internal_cpu/MaxMin_internal.hpp" +#include "linalg_internal_cpu/Sdd_internal.hpp" #include "linalg_internal_cpu/Sum_internal.hpp" -#include "linalg_internal_cpu/Det_internal.hpp" -#include "linalg_internal_cpu/Lstsq_internal.hpp" -#include "linalg_internal_cpu/Axpy_internal.hpp" -#include "linalg_internal_cpu/Ger_internal.hpp" -#include "linalg_internal_cpu/Gemm_internal.hpp" -#include "linalg_internal_cpu/Gemm_Batch_internal.hpp" #include "linalg_internal_cpu/Trace_internal.hpp" - +#include "linalg_internal_cpu/Tridiag_internal.hpp" +#include "linalg_internal_cpu/Vectordot_internal.hpp" +#include "linalg_internal_cpu/iArithmetic_internal.hpp" #include "linalg_internal_cpu/memcpyTruncation.hpp" #ifdef UNI_GPU - #include "linalg_internal_gpu/cuArithmetic_internal.hpp" #include "linalg_internal_gpu/cuAbs_internal.hpp" - #include "linalg_internal_gpu/cuSvd_internal.hpp" - #include "linalg_internal_gpu/cuGeSvd_internal.hpp" - #include "linalg_internal_gpu/cuEigh_internal.hpp" - #include "linalg_internal_gpu/cuInvM_inplace_internal.hpp" - #include "linalg_internal_gpu/cuInv_inplace_internal.hpp" + #include "linalg_internal_gpu/cuArithmetic_internal.hpp" #include "linalg_internal_gpu/cuConj_inplace_internal.hpp" + #include "linalg_internal_gpu/cuDet_internal.hpp" + #include "linalg_internal_gpu/cuDiag_internal.hpp" + #include "linalg_internal_gpu/cuEigh_internal.hpp" #include "linalg_internal_gpu/cuExp_internal.hpp" - #include "linalg_internal_gpu/cuGemm_internal.hpp" + #include "linalg_internal_gpu/cuGeSvd_internal.hpp" #include "linalg_internal_gpu/cuGemm_Batch_internal.hpp" - #include "linalg_internal_gpu/cuMatmul_internal.hpp" + #include "linalg_internal_gpu/cuGemm_internal.hpp" + #include "linalg_internal_gpu/cuGer_internal.hpp" + #include "linalg_internal_gpu/cuInvM_inplace_internal.hpp" + #include "linalg_internal_gpu/cuInv_inplace_internal.hpp" + #include "linalg_internal_gpu/cuKron_internal.hpp" #include "linalg_internal_gpu/cuMatmul_dg_internal.hpp" - #include "linalg_internal_gpu/cuDiag_internal.hpp" - #include "linalg_internal_gpu/cuOuter_internal.hpp" - #include 
"linalg_internal_gpu/cuNorm_internal.hpp" + #include "linalg_internal_gpu/cuMatmul_internal.hpp" #include "linalg_internal_gpu/cuMatvec_internal.hpp" - #include "linalg_internal_gpu/cuVectordot_internal.hpp" + #include "linalg_internal_gpu/cuMaxMin_internal.hpp" + #include "linalg_internal_gpu/cuNorm_internal.hpp" + #include "linalg_internal_gpu/cuOuter_internal.hpp" #include "linalg_internal_gpu/cuPow_internal.hpp" - #include "linalg_internal_gpu/cuGer_internal.hpp" - #include "linalg_internal_gpu/cuDet_internal.hpp" #include "linalg_internal_gpu/cuSum_internal.hpp" - #include "linalg_internal_gpu/cuMaxMin_internal.hpp" - #include "linalg_internal_gpu/cuKron_internal.hpp" + #include "linalg_internal_gpu/cuSvd_internal.hpp" + #include "linalg_internal_gpu/cuVectordot_internal.hpp" #include "linalg_internal_gpu/cudaMemcpyTruncation.hpp" #ifdef UNI_CUTENSOR #include "linalg_internal_gpu/cuTensordot_internal.hpp" @@ -191,14 +190,16 @@ namespace cytnx { typedef void (*memcpyTruncation_oii)(Tensor &U, Tensor &vT, Tensor &S, Tensor &terr, const cytnx_uint64 &keepdim, const double &err, const bool &is_U, const bool &is_vT, - const unsigned int &return_err); + const unsigned int &return_err, + const unsigned int &mindim); #ifdef UNI_GPU typedef void (*cudaMemcpyTruncation_oii)(Tensor &U, Tensor &vT, Tensor &S, Tensor &terr, const cytnx_uint64 &keepdim, const double &err, const bool &is_U, const bool &is_vT, - const unsigned int &return_err); + const unsigned int &return_err, + const unsigned int &mindim); #ifdef UNI_CUQUANTUM typedef void (*cuQuantumGeSvd_oii)(const Tensor &Tin, const cytnx_uint64 &keepdim, diff --git a/src/backend/utils_internal_gpu/cuMovemem_gpu.cu b/src/backend/utils_internal_gpu/cuMovemem_gpu.cu index 06d17b00a..413d7bf04 100644 --- a/src/backend/utils_internal_gpu/cuMovemem_gpu.cu +++ b/src/backend/utils_internal_gpu/cuMovemem_gpu.cu @@ -201,8 +201,8 @@ namespace cytnx { boost::intrusive_ptr cuMovemem_cutensor_gpu( boost::intrusive_ptr &in, const std::vector &old_shape, const std::vector &mapper, const std::vector &invmapper, - const bool is_inplace, cudaDataType_t type_in, cudaDataType_t type_out, - cudaDataType_t type_one, const cuT &ONE) { + const bool is_inplace, cutensorDataType_t type_in, cutensorDataType_t type_out, + const cutensorComputeDescriptor_t descCompute, const cuT &ONE) { T proxy; unsigned int dtype_T = Type_class::cy_typeid(proxy); #ifdef UNI_DEBUG @@ -233,20 +233,41 @@ namespace cytnx { std::reverse(ori.begin(), ori.end()); // matching API cutensorHandle_t handle; - checkCudaErrors(cutensorInit(&handle)); + checkCudaErrors(cutensorCreate(&handle)); + // This is the default alignment of cudaMalloc() and may also be the default alignment of + // cudaMallocManaged() + cytnx_uint64 defaultAlignment = 256; cutensorTensorDescriptor_t descA; - checkCudaErrors(cutensorInitTensorDescriptor(&handle, &descA, size.size(), size.data(), - NULL /* stride */, type_in, - CUTENSOR_OP_IDENTITY)); + checkCudaErrors(cutensorCreateTensorDescriptor(handle, &descA, size.size(), size.data(), + NULL /* stride */, type_in, defaultAlignment)); cutensorTensorDescriptor_t descC; - checkCudaErrors(cutensorInitTensorDescriptor(&handle, &descC, new_size.size(), - new_size.data(), NULL /* stride */, type_out, - CUTENSOR_OP_IDENTITY)); + checkCudaErrors(cutensorCreateTensorDescriptor(handle, &descC, new_size.size(), + new_size.data(), NULL /* stride */, type_out, + defaultAlignment)); + // TODO: verify the type of ONE matches descCompute + cutensorOperationDescriptor_t desc; + 
checkCudaErrors(cutensorCreatePermutation( + handle, &desc, descA, ori.data(), CUTENSOR_OP_IDENTITY, descC, perm.data(), descCompute)); - checkCudaErrors(cutensorPermutation(&handle, &ONE, (cuT *)in->Mem, &descA, ori.data(), dtmp, - &descC, perm.data(), type_one, 0 /* stream */)); + const cutensorAlgo_t algo = CUTENSOR_ALGO_DEFAULT; + + cutensorPlanPreference_t planPref; + checkCudaErrors( + cutensorCreatePlanPreference(handle, &planPref, algo, CUTENSOR_JIT_MODE_NONE)); + + cutensorPlan_t plan; + checkCudaErrors( + cutensorCreatePlan(handle, &plan, desc, planPref, 0 /* workspaceSizeLimit */)); + + checkCudaErrors(cutensorPermute(handle, plan, &ONE, (cuT *)in->Mem, dtmp, 0 /* stream */)); + + checkCudaErrors(cutensorDestroyTensorDescriptor(descA)); + checkCudaErrors(cutensorDestroyTensorDescriptor(descC)); + checkCudaErrors(cutensorDestroyPlanPreference(planPref)); + checkCudaErrors(cutensorDestroyPlan(plan)); + checkCudaErrors(cutensorDestroy(handle)); boost::intrusive_ptr out = __SII.USIInit[dtype_T](); if (is_inplace) { @@ -269,8 +290,8 @@ namespace cytnx { const bool is_inplace) { #ifdef UNI_CUTENSOR return cuMovemem_cutensor_gpu( - in, old_shape, mapper, invmapper, is_inplace, CUDA_C_64F, CUDA_C_64F, CUDA_C_64F, - make_cuDoubleComplex(1, 0)); + in, old_shape, mapper, invmapper, is_inplace, CUTENSOR_C_64F, CUTENSOR_C_64F, + CUTENSOR_COMPUTE_DESC_64F, make_cuDoubleComplex(1, 0)); #elif defined(UNI_CUTT) return cuMovemem_cutt_gpu(in, old_shape, mapper, invmapper, is_inplace); @@ -287,8 +308,8 @@ namespace cytnx { const bool is_inplace) { #if defined(UNI_CUTENSOR) return cuMovemem_cutensor_gpu( - in, old_shape, mapper, invmapper, is_inplace, CUDA_C_32F, CUDA_C_32F, CUDA_C_32F, - make_cuFloatComplex(1, 0)); + in, old_shape, mapper, invmapper, is_inplace, CUTENSOR_C_32F, CUTENSOR_C_32F, + CUTENSOR_COMPUTE_DESC_32F, make_cuFloatComplex(1, 0)); #elif defined(UNI_CUTT) return cuMovemem_cutt_gpu(in, old_shape, mapper, invmapper, is_inplace); @@ -305,7 +326,8 @@ namespace cytnx { const bool is_inplace) { #if defined(UNI_CUTENSOR) return cuMovemem_cutensor_gpu(in, old_shape, mapper, invmapper, is_inplace, - CUDA_R_64F, CUDA_R_64F, CUDA_R_64F, double(1)); + CUTENSOR_R_64F, CUTENSOR_R_64F, + CUTENSOR_COMPUTE_DESC_64F, double(1)); #elif defined(UNI_CUTT) return cuMovemem_cutt_gpu(in, old_shape, mapper, invmapper, is_inplace); @@ -321,7 +343,8 @@ namespace cytnx { const bool is_inplace) { #if defined(UNI_CUTENSOR) return cuMovemem_cutensor_gpu(in, old_shape, mapper, invmapper, is_inplace, - CUDA_R_32F, CUDA_R_32F, CUDA_R_32F, float(1)); + CUTENSOR_R_32F, CUTENSOR_R_32F, + CUTENSOR_COMPUTE_DESC_32F, float(1)); #elif defined(UNI_CUTT) return cuMovemem_cutt_gpu(in, old_shape, mapper, invmapper, is_inplace); diff --git a/src/linalg/Arnoldi.cpp b/src/linalg/Arnoldi.cpp index c4c202b56..7437d8945 100644 --- a/src/linalg/Arnoldi.cpp +++ b/src/linalg/Arnoldi.cpp @@ -12,9 +12,13 @@ namespace cytnx { namespace linalg { typedef Accessor ac; + // + static Scalar _Dot(const UniTensor &A, const UniTensor &B) { + return Contract(A.Dagger(), B).item(); + } // resize the matrix (2-rank tensor) - static Tensor ResizeMat(const Tensor &src, const cytnx_uint64 r, const cytnx_uint64 c) { + static Tensor _resize_mat(const Tensor &src, const cytnx_uint64 r, const cytnx_uint64 c) { const auto min_r = std::min(r, src.shape()[0]); const auto min_c = std::min(c, src.shape()[1]); // Tensor dst = src[{ac::range(0,min_r),ac::range(0,min_c)}]; @@ -32,8 +36,9 @@ namespace cytnx { } // Get the indices of the first few order element - 
std::vector GetFstFewOrderElemIdices(const Tensor &tens, const std::string &which, - const cytnx_int64 k) { + std::vector _get_fst_few_order_elem_indices(const Tensor &tens, + const std::string &which, + const cytnx_int64 k) { char large_or_small = which[0]; //'S' or 'L' char metric_type = which[1]; //'M', 'R' or 'I' @@ -64,8 +69,8 @@ namespace cytnx { return indices; } - bool IsEigvalCvg(const std::vector &eigvals, const std::vector &eigvals_old, - const double cvg_crit) { + bool _is_eigval_cvg(const std::vector &eigvals, const std::vector &eigvals_old, + const double cvg_crit) { for (cytnx_int32 i = 0; i < eigvals.size(); ++i) { auto err = abs(eigvals[i] - eigvals_old[i]); if (err >= cvg_crit) return false; @@ -74,8 +79,19 @@ namespace cytnx { } // check the residule |Mv - ev| is converged. - bool IsResiduleSmallEnough(LinOp *Hop, const std::vector &eigvecs, - const std::vector &eigvals, const double cvg_crit) { + bool _is_residule_small_enough(LinOp *Hop, const std::vector &eigvecs, + const std::vector &eigvals, const double cvg_crit) { + for (cytnx_int32 i = 0; i < eigvals.size(); ++i) { + auto eigvec = eigvecs[i]; + auto eigval = eigvals[i]; + auto resi = (Hop->matvec(eigvec) - eigval * eigvec).Norm().item(); + if (resi >= cvg_crit) return false; + } + return true; + } + + bool _is_residule_small_enough(LinOp *Hop, const std::vector &eigvecs, + const std::vector &eigvals, const double cvg_crit) { for (cytnx_int32 i = 0; i < eigvals.size(); ++i) { auto eigvec = eigvecs[i]; auto eigval = eigvals[i]; @@ -85,8 +101,8 @@ namespace cytnx { return true; } - std::vector GetEigTens(const std::vector qs, Tensor eigvec_in_kryv, - std::vector max_indices) { + std::vector _get_eig_tens(const std::vector &qs, Tensor eigvec_in_kryv, + std::vector max_indices) { auto k = max_indices.size(); cytnx_int64 krydim = eigvec_in_kryv.shape()[0]; auto P_inv = InvM(eigvec_in_kryv).Conj(); @@ -95,7 +111,26 @@ namespace cytnx { auto maxIdx = max_indices[ik]; auto eigTens = zeros(qs[0].shape(), Type.ComplexDouble); for (cytnx_int64 i = 0; i < krydim; ++i) { - eigTens += P_inv[{i, maxIdx}] * qs[i]; + eigTens += P_inv[{i, static_cast(maxIdx)}] * qs[i]; + } + eigTens /= eigTens.Norm().item(); + eigTens_s[ik] = eigTens; + } + return eigTens_s; + } + + std::vector _get_eig_tens(const std::vector &qs, Tensor eigvec_in_kryv, + std::vector max_indices) { + auto k = max_indices.size(); + cytnx_int64 krydim = eigvec_in_kryv.shape()[0]; + auto P_inv = InvM(eigvec_in_kryv).Conj(); + auto eigTens_s = std::vector(k, UniTensor()); + for (cytnx_int32 ik = 0; ik < k; ++ik) { + auto maxIdx = max_indices[ik]; + auto eigTens = P_inv.at({0, static_cast(maxIdx)}) * qs[0]; + for (cytnx_int64 i = 1; i < krydim; ++i) { + eigTens += + P_inv.at({static_cast(i), static_cast(maxIdx)}) * qs[i]; } eigTens /= eigTens.Norm().item(); eigTens_s[ik] = eigTens; @@ -134,22 +169,22 @@ namespace cytnx { auto h = buffer[i].Norm().item(); kry_mat_buffer[{i - 1, i}] = h; buffer[i] /= h; - Tensor kry_mat = ResizeMat(kry_mat_buffer, krydim, krydim); + Tensor kry_mat = _resize_mat(kry_mat_buffer, krydim, krydim); // call Eig to get eigenvalues auto eigs = Eig(kry_mat, true, true); // get first few order of eigenvlues - std::vector maxIndices = GetFstFewOrderElemIdices(eigs[0], which, k); + std::vector maxIndices = _get_fst_few_order_elem_indices(eigs[0], which, k); for (cytnx_int32 ik = 0; ik < k; ++ik) { auto maxIdx = maxIndices[ik]; eigvals[ik] = eigs[0].storage().at(maxIdx); } // check converged - bool is_eigval_cvg = IsEigvalCvg(eigvals, eigvals_old, 
CvgCrit); + bool is_eigval_cvg = _is_eigval_cvg(eigvals, eigvals_old, CvgCrit); if (is_eigval_cvg || i == imp_maxiter - 1) { - eigTens_s = GetEigTens(buffer, eigs[1], maxIndices); - bool is_res_small_enough = IsResiduleSmallEnough(Hop, eigTens_s, eigvals, CvgCrit); + eigTens_s = _get_eig_tens(buffer, eigs[1], maxIndices); + bool is_res_small_enough = _is_residule_small_enough(Hop, eigTens_s, eigvals, CvgCrit); if (is_res_small_enough) { is_cvg = true; break; @@ -172,9 +207,80 @@ namespace cytnx { } } + void _Arnoldi(std::vector &out, LinOp *Hop, const UniTensor &UT_init, + const std::string which, const cytnx_uint64 &maxiter, const double &CvgCrit, + const cytnx_uint64 &k, const bool &is_V, const bool &verbose) { + int dim = 1; + auto UT_init_shape = UT_init.shape(); + for (int i = 0; i < UT_init.rank(); ++i) { + dim *= UT_init_shape[i]; + } + const cytnx_uint64 imp_maxiter = std::min(maxiter, Hop->nx() + 1); + const cytnx_complex128 unit_complex = 1.0; + auto eigvals = std::vector(k, Scalar()); + std::vector eigTens_s; + Tensor kry_mat_buffer = + cytnx::zeros({imp_maxiter + 1, imp_maxiter + 1}, Hop->dtype(), Hop->device()); + bool is_cvg = false; + auto eigvals_old = std::vector(k, Scalar::maxval(Type.Double)); + std::vector buffer; + buffer.push_back(UT_init); + buffer[0] = buffer[0] / buffer[0].Norm().item(); // normalized q1 + + // start arnoldi iteration + for (auto i = 1; i < imp_maxiter; i++) { + cytnx_uint64 krydim = i; + auto nextTens = Hop->matvec(buffer[i - 1]).astype(Hop->dtype()); + buffer.push_back(nextTens); + for (cytnx_uint32 j = 0; j < krydim; j++) { + auto h = _Dot(buffer[i], buffer[j]).conj(); + kry_mat_buffer[{i - 1, j}] = h; + buffer[i] -= h * buffer[j]; + } + auto h = std::sqrt(static_cast(_Dot(buffer[i], buffer[i]).real())); + kry_mat_buffer[{i - 1, i}] = h; + buffer[i] /= h; + Tensor kry_mat = _resize_mat(kry_mat_buffer, krydim, krydim); + + // call Eig to get eigenvalues + auto eigs = Eig(kry_mat, true, true); + // get first few order of eigenvlues + std::vector maxIndices = _get_fst_few_order_elem_indices(eigs[0], which, k); + for (cytnx_int32 ik = 0; ik < k; ++ik) { + auto maxIdx = maxIndices[ik]; + eigvals[ik] = eigs[0].storage().at(maxIdx); + } + + // check converged + bool is_eigval_cvg = _is_eigval_cvg(eigvals, eigvals_old, CvgCrit); + if (is_eigval_cvg || i == imp_maxiter - 1) { + eigTens_s = _get_eig_tens(buffer, eigs[1], maxIndices); + bool is_res_small_enough = _is_residule_small_enough(Hop, eigTens_s, eigvals, CvgCrit); + if (is_res_small_enough) { + is_cvg = true; + break; + } + } + eigvals_old = eigvals; + } // Arnoldi iteration + buffer.clear(); + + // set output + auto eigvals_tens = zeros({k}, Type.ComplexDouble); // initialize + for (cytnx_int32 ik = 0; ik < k; ++ik) eigvals_tens[{ac(ik)}] = eigvals[ik]; + out.push_back(UniTensor(eigvals_tens)); + if (is_V) // if need output eigentensors + { + out.insert(out.end(), eigTens_s.begin(), eigTens_s.end()); + } + } + std::vector Arnoldi(LinOp *Hop, const Tensor &T_init, const std::string which, const cytnx_uint64 &maxiter, const double &cvg_crit, const cytnx_uint64 &k, const bool &is_V, const bool &verbose) { + // check device: + cytnx_error_msg(Hop->device() != Device.cpu, + "[ERROR][Arnoldi] Arnoldi still not sopprot cuda devices.%s", "\n"); // check type: cytnx_error_msg( !Type.is_float(Hop->dtype()), @@ -218,6 +324,67 @@ namespace cytnx { return out; } + std::vector Arnoldi(LinOp *Hop, const UniTensor &UT_init, const std::string which, + const cytnx_uint64 &maxiter, const double &cvg_crit, + const 
cytnx_uint64 &k, const bool &is_V, const bool &verbose) { + // check device: + cytnx_error_msg(Hop->device() != Device.cpu, + "[ERROR][Arnoldi] Arnoldi still not sopprot cuda devices.%s", "\n"); + // check type: + + cytnx_error_msg(UT_init.uten_type() == UTenType.Block, + "[ERROR][Arnoldi] The Block UniTensor type is still not supported.%s", "\n"); + // check type: + cytnx_error_msg( + !Type.is_float(Hop->dtype()), + "[ERROR][Arnoldi] Arnoldi can only accept operator with floating types (complex/real)%s", + "\n"); + + // check which + std::vector accept_which = {"LM", "LR", "LI", "SM", "SR", "SI"}; + if (std::find(accept_which.begin(), accept_which.end(), which) == accept_which.end()) { + cytnx_error_msg(true, + "[ERROR][Arnoldi] 'which' should be 'LM', 'LR, 'LI'" + ", 'SR, 'SI'", + "\n"); + } + //'SM' is not support for UniTensor since for sparce operator, the eigenvalue results will be + // 0. + if (which == "SM") { + cytnx_error_msg(true, + "[ERROR][Arnoldi] 'which' cannot be 'SM', this function not support to ", + "simulate the smallest magnitude. \n"); + } + + /// check k + cytnx_error_msg(k < 1, "[ERROR][Arnoldi] k should be >0%s", "\n"); + cytnx_error_msg(k > Hop->nx(), + "[ERROR][Arnoldi] k can only be up to total dimension of input vector D%s", + "\n"); + + // check Tin should be rank-1: + auto _UT_init = UT_init.clone(); + if (UT_init.dtype() == Type.Void) { + cytnx_error_msg(k < 1, "[ERROR][Arnoldi] The initial UniTensor sould be defined.%s", "\n"); + } else { + auto init_shape = UT_init.shape(); + int dim = 1; + for (auto &x : init_shape) { + dim *= x; + } + cytnx_error_msg(dim != Hop->nx(), + "[ERROR][Arnoldi] Tin should have dimension consistent with Hop: [%d] %s", + Hop->nx(), "\n"); + _UT_init = UT_init.astype(Hop->dtype()); + } + + cytnx_error_msg(cvg_crit <= 0, "[ERROR][Arnoldi] cvg_crit should be > 0%s", "\n"); + double _cvgcrit = cvg_crit; + auto out = std::vector(); + _Arnoldi(out, Hop, UT_init, which, maxiter, cvg_crit, k, is_V, verbose); + return out; + } + } // namespace linalg } // namespace cytnx diff --git a/src/linalg/CMakeLists.txt b/src/linalg/CMakeLists.txt index 2b3047f99..7515e53f0 100644 --- a/src/linalg/CMakeLists.txt +++ b/src/linalg/CMakeLists.txt @@ -63,6 +63,7 @@ target_sources_local(cytnx Tridiag.cpp Lanczos.cpp Lanczos_ER.cpp + Lanczos_Exp.cpp Lanczos_Gnd.cpp Lanczos_Gnd_Ut.cpp Lstsq.cpp diff --git a/src/linalg/Gesvd_truncate.cpp b/src/linalg/Gesvd_truncate.cpp index 10c0a3db6..4fe567706 100644 --- a/src/linalg/Gesvd_truncate.cpp +++ b/src/linalg/Gesvd_truncate.cpp @@ -1,9 +1,10 @@ -#include "linalg.hpp" -#include "Accessor.hpp" #include + +#include "Accessor.hpp" #include "Tensor.hpp" #include "UniTensor.hpp" #include "algo.hpp" +#include "linalg.hpp" #ifdef BACKEND_TORCH #else @@ -21,7 +22,7 @@ namespace cytnx { typedef Accessor ac; std::vector Gesvd_truncate(const Tensor &Tin, const cytnx_uint64 &keepdim, const double &err, const bool &is_U, const bool &is_vT, - const unsigned int &return_err) { + const unsigned int &return_err, const unsigned int &mindim) { cytnx_error_msg(Tin.shape().size() != 2, "[Gesvd_truncate] error, Gesvd_truncate can only operate on rank-2 Tensor.%s", "\n"); @@ -31,7 +32,7 @@ namespace cytnx { Tensor terr({1}, Tin.dtype(), Tin.device()); cytnx::linalg_internal::lii.memcpyTruncation_ii[Tin.dtype()]( - tmps[1], tmps[2], tmps[0], terr, keepdim, err, is_U, is_vT, return_err); + tmps[1], tmps[2], tmps[0], terr, keepdim, err, is_U, is_vT, return_err, mindim); std::vector outT; outT.push_back(tmps[0]); @@ -65,7 +66,7 @@ 
namespace cytnx { S, vT, terr); cytnx::linalg_internal::lii.cudaMemcpyTruncation_ii[in.dtype()]( - U, vT, S, terr, keepdim, err, is_U, is_vT, return_err); + U, vT, S, terr, keepdim, err, is_U, is_vT, return_err, mindim); std::vector outT; outT.push_back(S); @@ -80,7 +81,7 @@ namespace cytnx { Tensor terr({1}, Tin.dtype(), Tin.device()); cytnx::linalg_internal::lii.cudaMemcpyTruncation_ii[Tin.dtype()]( - tmps[1], tmps[2], tmps[0], terr, keepdim, err, is_U, is_vT, return_err); + tmps[1], tmps[2], tmps[0], terr, keepdim, err, is_U, is_vT, return_err, mindim); std::vector outT; outT.push_back(tmps[0]); @@ -107,7 +108,8 @@ namespace cytnx { void _gesvd_truncate_Dense_UT(std::vector &outCyT, const cytnx::UniTensor &Tin, const cytnx_uint64 &keepdim, const double &err, const bool &is_U, - const bool &is_vT, const unsigned int &return_err) { + const bool &is_vT, const unsigned int &return_err, + const unsigned int &mindim) { // DenseUniTensor: cytnx_uint64 keep_dim = keepdim; @@ -126,7 +128,7 @@ namespace cytnx { tmp = tmp.reshape({rowdim, -1}); vector outT = - cytnx::linalg::Gesvd_truncate(tmp, keepdim, err, is_U, is_vT, return_err); + cytnx::linalg::Gesvd_truncate(tmp, keepdim, err, is_U, is_vT, return_err, mindim); // if(Tin.is_contiguous()) tmp.reshape_(oldshape); @@ -210,7 +212,8 @@ namespace cytnx { void _gesvd_truncate_Block_UT(std::vector &outCyT, const cytnx::UniTensor &Tin, const cytnx_uint64 &keepdim, const double &err, const bool &is_U, - const bool &is_vT, const unsigned int &return_err) { + const bool &is_vT, const unsigned int &return_err, + const unsigned int &mindim) { cytnx_uint64 keep_dim = keepdim; outCyT = linalg::Gesvd(Tin, is_U, is_vT); @@ -229,7 +232,7 @@ namespace cytnx { if (keep_dim < Sall.shape()[0]) { smidx = Sall.shape()[0] - keep_dim; Smin = Sall.storage()(smidx); - while ((Smin < err)) { + while ((Smin < err) and keep_dim - 1 > mindim) { keep_dim -= 1; if (keep_dim == 0) break; smidx = Sall.shape()[0] - keep_dim; @@ -359,7 +362,8 @@ namespace cytnx { std::vector Gesvd_truncate(const cytnx::UniTensor &Tin, const cytnx_uint64 &keepdim, const double &err, const bool &is_U, const bool &is_vT, - const unsigned int &return_err) { + const unsigned int &return_err, + const unsigned int &mindim) { // using rowrank to split the bond to form a matrix. 
cytnx_error_msg((Tin.rowrank() < 1 || Tin.rank() == 1 || Tin.rowrank() == Tin.rank()), "[Gesvd][ERROR] Gesvd for UniTensor should have rank>1 and rank>rowrank>0%s", @@ -367,9 +371,9 @@ namespace cytnx { std::vector outCyT; if (Tin.uten_type() == UTenType.Dense) { - _gesvd_truncate_Dense_UT(outCyT, Tin, keepdim, err, is_U, is_vT, return_err); + _gesvd_truncate_Dense_UT(outCyT, Tin, keepdim, err, is_U, is_vT, return_err, mindim); } else if (Tin.uten_type() == UTenType.Block) { - _gesvd_truncate_Block_UT(outCyT, Tin, keepdim, err, is_U, is_vT, return_err); + _gesvd_truncate_Block_UT(outCyT, Tin, keepdim, err, is_U, is_vT, return_err, mindim); } else { cytnx_error_msg(true, "[ERROR] only support gesvd for Dense and Block UniTensor.%s", "\n"); } diff --git a/src/linalg/Lanczos_Exp.cpp b/src/linalg/Lanczos_Exp.cpp new file mode 100644 index 000000000..b9d991284 --- /dev/null +++ b/src/linalg/Lanczos_Exp.cpp @@ -0,0 +1,408 @@ +#include "linalg.hpp" +#include "Generator.hpp" +#include "random.hpp" +#include "Tensor.hpp" +#include "LinOp.hpp" + +#include +#include +#include +#include "UniTensor.hpp" +#include "utils/vec_print.hpp" +#include + +#ifdef BACKEND_TORCH +#else + +namespace cytnx { + namespace linalg { + typedef Accessor ac; + using namespace std; + + // + static Scalar _Dot(const UniTensor &A, const UniTensor &B) { + return Contract(A.Dagger(), B).item(); + } + + // project v to u + static UniTensor _Gram_Schimidt_proj(const UniTensor &v, const UniTensor &u) { + auto nu = _Dot(u, v); + auto de = _Dot(u, u); + auto coe = nu / de; + return coe * u; + } + + static UniTensor _Gram_Schimidt(const std::vector &vs) { + auto u = vs.at(0).clone(); + double low = -1.0, high = 1.0; + random::uniform_(u, low, high); + for (auto &v : vs) { + u -= _Gram_Schimidt_proj(u, v); + } + return u; + } + + static Tensor _resize_mat(const Tensor &src, const cytnx_uint64 r, const cytnx_uint64 c) { + const auto min_r = std::min(r, src.shape()[0]); + const auto min_c = std::min(c, src.shape()[1]); + // Tensor dst = src[{ac::range(0,min_r),ac::range(0,min_c)}]; + + Tensor dst = Tensor({min_r, min_c}, src.dtype(), src.device(), false); + char *tgt = (char *)dst.storage().data(); + char *csc = (char *)src.storage().data(); + unsigned long long Offset_csc = Type.typeSize(src.dtype()) * src.shape()[1]; + unsigned long long Offset_tgt = Type.typeSize(src.dtype()) * min_c; + for (auto i = 0; i < min_r; ++i) { + memcpy(tgt + Offset_tgt * i, csc + Offset_csc * i, Type.typeSize(src.dtype()) * min_c); + } + + return dst; + } + + // BiCGSTAB method to solve the linear equation + // ref: https://en.wikipedia.org/wiki/Biconjugate_gradient_stabilized_method + UniTensor _invert_biCGSTAB(LinOp *Hop, const UniTensor &b, const UniTensor &Tin, const int &k, + const double &CvgCrit = 1.0e-12, + const unsigned int &Maxiter = 10000) { + // the operation (I + Hop/k) on A + auto I_plus_A_Op = [&](UniTensor A) { + return ((Hop->matvec(A)) / k + A).relabels_(b.labels()); + }; + // the residuals of (b - (I + Hop/k)x) + auto r = (b - I_plus_A_Op(Tin)).relabels_(b.labels()); + // choose r0_hat = r + auto r0 = r; + auto x = Tin; + // choose p = (r0_hat, r) + auto p = _Dot(r0, r); + // choose pv = r + auto pv = r; + + // to reduce the variables used, replace p[i]->p, p[i-1]->p_old, etc. 
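+ // The loop below iterates BiCGSTAB on (I + Hop/k) x = b, stopping once the squared residual norm (|(s,s)| or |(r,r)|) drops below CvgCrit or Maxiter is reached.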
+ auto p_old = p; + auto pv_old = pv; + auto x_old = x; + auto r_old = r; + + // all of logic here is same + // as:https://en.wikipedia.org/wiki/Biconjugate_gradient_stabilized_method + for (int i = 1; i < Maxiter; ++i) { + auto v = I_plus_A_Op(pv_old); + auto a = p_old / _Dot(r0, v); + auto h = (x_old + a * pv_old).relabels_(b.labels()); + auto s = (r_old - a * v).relabels_(b.labels()); + if (abs(_Dot(s, s)) < CvgCrit) { + x = h; + break; + } + auto t = I_plus_A_Op(s); + auto w = _Dot(t, s) / _Dot(t, t); + x = (h + w * s).relabels_(b.labels()); + r = (s - w * t).relabels_(b.labels()); + if (abs(_Dot(r, r)) < CvgCrit) { + break; + } + auto p = _Dot(r0, r); + auto beta = (p / p_old) * (a / w); + pv = (r + beta * (pv_old - w * v)).relabels_(b.labels()); + + // update + pv_old = pv; + p_old = p; + x_old = x; + r_old = r; + } + return x; + } + + // ref: https://doi.org/10.48550/arXiv.1111.1491 + void _Lanczos_Exp_Ut_positive(UniTensor &out, LinOp *Hop, const UniTensor &Tin, + const double &CvgCrit, const unsigned int &Maxiter, + const bool &verbose) { + double delta = CvgCrit; + int k = static_cast(std::log(1.0 / delta)); + k = k < Maxiter ? k : Maxiter; + auto Op_apprx_norm = + static_cast(Hop->matvec(Tin).get_block_().flatten().Norm().item().real()); + double eps1 = std::exp(-(k * std::log(k) + std::log(1.0 + Op_apprx_norm))); + + std::vector vs; + Tensor as = zeros({(cytnx_uint64)k + 1, (cytnx_uint64)k + 1}, Hop->dtype(), Hop->device()); + + // Initialized v0 = v + auto v = Tin; + auto v0 = v; + auto Vk_shape = v0.shape(); + Vk_shape.insert(Vk_shape.begin(), 1); + auto Vk = v0.get_block().reshape(Vk_shape); + std::vector Vs; + Vs.push_back(v); + + // For i = 0 to k + for (int i = 0; i <= k; ++i) { + // Call the procedure Invert_A (v[i], k, eps1). The procedure returns a vector w[i], such + // that, + // |(I + A / k )^(−1) v[i] − w[i]| ≤ eps1 |v[i]| . + auto w = _invert_biCGSTAB(Hop, v, v, k, eps1); + // auto resi = ((Hop->matvec(w))/k + w).relabels_(v.labels()) - v; + + // For j = 0 to i + for (int j = 0; j <= i; ++j) { + // Let a[j,i] = + as.at({(cytnx_uint64)j, (cytnx_uint64)i}) = _Dot(Vs.at(j), w); + } + // Define wp[i] = w[i] - \sum_{j=0}^{j=i} {a[j,i]v[j]} + auto wp = w; + for (int j = 0; j <= i; j++) { + wp -= as.at({(cytnx_uint64)j, (cytnx_uint64)i}) * Vs.at(j); + } + // Let a[i+1, i] = |wp[i]|, v[i+1]=wp[i] / a[i+1, i] + auto b = std::sqrt(double(_Dot(wp, wp).real())); + if (i < k) { + as.at({(cytnx_uint64)i + 1, (cytnx_uint64)i}) = b; + v = wp / b; + Vk.append(v.get_block_()); + Vs.push_back(v); + } + // For j = i+2 to k + // Let a[j,i] = 0 + } + + // Let V_k be the n × (k + 1) matrix whose columns are v[0],...,v[k] respectively. + UniTensor Vk_ut(Vk); + Vk_ut.set_rowrank_(1); + auto VkDag_ut = Vk_ut.Dagger(); + // Let T_k be the (k + 1) × (k + 1) matrix a[i,j] i,j is {0,...,k} and Tk_hat = 1 / 2 + // (Tk^Dagger + Tk). + auto asT = as.permute({1, 0}).Conj().contiguous(); + auto Tk_hat = 0.5 * (asT + as); + // Compute B = exp k · (I − Tk_hat^(−1) ) exactly and output the vector V_k*B*V_k^Dagger v. 
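+ // i.e. out = V_k * exp(k (I - Tk_hat^{-1})) * V_k^Dagger * v0, assembled below by two contractions (see the diagram).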
+ auto I = eye(k + 1); + auto B_mat = linalg::ExpH(k * (I - linalg::InvM(Tk_hat))); + /* + * ||| + * |-----| + * | out | = + * |_____| + * + * + * ||| + * |-----| + * | V_k | + * |_____| + * | kl:(k+1) * (k + 1) + * | + * |-----| + * | B | + * |_____| + * | kr:(k+1) * (k + 1) + * | + * |------------| + * | V_k^Dagger | + * |____________| + * ||| + * |-----| + * | v0 | + * |_____| + * + */ + auto B = UniTensor(B_mat, false, 1, {"cytnx_internal_label_kl", "cytnx_internal_label_kr"}); + auto label_kl = B.labels()[0]; + auto label_kr = B.labels()[1]; + auto Vk_labels = v0.labels(); + Vk_labels.insert(Vk_labels.begin(), label_kl); + Vk_ut.relabels_(Vk_labels); + auto VkDag_labels = v0.labels(); + VkDag_labels.push_back(label_kr); + VkDag_ut.relabels_(VkDag_labels); + + // Vk_ut.print_diagram(); + // VkDag_ut.print_diagram(); + // v0.print_diagram(); + // B.print_diagram(); + + out = Contracts({v0, VkDag_ut, B}, "", true); + out = Contract(out, Vk_ut); + out.set_rowrank_(v0.rowrank()); + } + + void _Lanczos_Exp_Ut(UniTensor &out, LinOp *Hop, const UniTensor &T, Scalar tau, + const double &CvgCrit, const unsigned int &Maxiter, const bool &verbose) { + const double beta_tol = 1.0e-6; + std::vector vs; + cytnx_uint32 vec_len = Hop->nx(); + cytnx_uint32 imp_maxiter = std::min(Maxiter, vec_len + 1); + Tensor Hp = zeros({imp_maxiter, imp_maxiter}, Hop->dtype(), Hop->device()); + + Tensor B_mat; + // prepare initial tensor and normalize + auto v = T.clone(); + auto v_nrm = std::sqrt(double(_Dot(v, v).real())); + v = v / v_nrm; + + // first iteration + auto wp = (Hop->matvec(v)).relabels_(v.labels()); + auto alpha = _Dot(wp, v); + Hp.at({0, 0}) = alpha; + auto w = (wp - alpha * v).relabels_(v.labels()); + + // prepare U + auto Vk_shape = v.shape(); + Vk_shape.insert(Vk_shape.begin(), 1); + auto Vk = v.get_block().reshape(Vk_shape); + std::vector Vs; + Vs.push_back(v); + UniTensor v_old; + Tensor Hp_sub; + + for (int i = 1; i < imp_maxiter; ++i) { + if (verbose) { + std::cout << "Lancos iteration:" << i << std::endl; + } + auto beta = std::sqrt(double(_Dot(w, w).real())); + v_old = v.clone(); + if (beta > beta_tol) { + v = (w / beta).relabels_(v.labels()); + } else { // beta too small -> the norm of new vector too small. This vector cannot span the + // new dimension + if (verbose) { + std::cout << "beta too small, pick another vector." << i << std::endl; + } + // pick a new vector perpendicular to all vector in Vs + v = _Gram_Schimidt(Vs).relabels_(v.labels()); + auto v_norm = _Dot(v, v); + // if the picked vector also too small, break and construct expH + if (abs(v_norm) <= beta_tol) { + if (verbose) { + std::cout << "All vector form the space. Break." << i << std::endl; + } + break; + } + v = v / v_norm; + } + Vk.append(v.get_block_().contiguous()); + Vs.push_back(v); + Hp.at({(cytnx_uint64)i, (cytnx_uint64)i - 1}) = + Hp.at({(cytnx_uint64)i - 1, (cytnx_uint64)i}) = beta; + wp = (Hop->matvec(v)).relabels_(v.labels()); + alpha = _Dot(wp, v); + Hp.at({(cytnx_uint64)i, (cytnx_uint64)i}) = alpha; + w = (wp - alpha * v - beta * v_old).relabels_(v.labels()); + + // Converge check + Hp_sub = _resize_mat(Hp, i + 1, i + 1); + // We use ExpM since H*tau may not be Hermitian if tau is complex. 
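      // Krylov approximation used here: with V_k the orthonormal Lanczos basis and Hp_sub the
      // tridiagonal projection of Hop, exp(tau*Hop) * T ~= |T| * V_k * exp(tau*Hp_sub) * e1.
      // The magnitude of the bottom-left entry of exp(tau*Hp_sub), i.e. the last component of
      // that first column, serves below as the a-posteriori error estimate for stopping.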
+ B_mat = linalg::ExpM(Hp_sub * tau); + // Set the error as the element of bottom left of the exp(H_sub*tau) + auto error = abs(B_mat.at({(cytnx_uint64)i, 0})); + if (error < CvgCrit || i == imp_maxiter - 1) { + if (i == imp_maxiter - 1 && error > CvgCrit) { + cytnx_warning_msg( + true, + "[WARNING][Lanczos_Exp] Fail to converge at eigv [%d], try increasing " + "maxiter?\n Note:: ignore if this is intended.%s", + imp_maxiter, "\n"); + } + break; + } + } + // std::cout << B_mat; + + // Let V_k be the n × (k + 1) matrix whose columns are v[0],...,v[k] respectively. + UniTensor Vk_ut(Vk); + Vk_ut.set_rowrank_(1); + auto VkDag_ut = Vk_ut.Dagger(); + /* + * ||| + * |-----| + * | out | = + * |_____| + * + * + * ||| + * |-----| + * | V_k | + * |_____| + * | kl:(k+1) * (k + 1) + * | + * |-----| + * | B | + * |_____| + * | kr:(k+1) * (k + 1) + * | + * |------------| + * | V_k^Dagger | + * |____________| + * ||| + * |-----| + * | v0 | + * |_____| + * + */ + + auto B = UniTensor(B_mat, false, 1, {"cytnx_internal_label_kl", "cytnx_internal_label_kr"}); + auto label_kl = B.labels()[0]; + auto label_kr = B.labels()[1]; + auto Vk_labels = v.labels(); + Vk_labels.insert(Vk_labels.begin(), label_kl); + Vk_ut.relabels_(Vk_labels); + auto VkDag_labels = v.labels(); + VkDag_labels.push_back(label_kr); + VkDag_ut.relabels_(VkDag_labels); + + out = Contracts({T, VkDag_ut, B}, "", true); + out = Contract(out, Vk_ut); + out.set_rowrank_(v.rowrank()); + } + + // Lanczos_Exp + UniTensor Lanczos_Exp(LinOp *Hop, const UniTensor &Tin, const Scalar &tau, + const double &CvgCrit, const unsigned int &Maxiter, const bool &verbose) { + // check device: + cytnx_error_msg(Hop->device() != Device.cpu, + "[ERROR][Lanczos_Exp] Lanczos_Exp still not sopprot cuda devices.%s", "\n"); + // check type: + cytnx_error_msg(!Type.is_float(Hop->dtype()), + "[ERROR][Lanczos_Exp] Lanczos_Exp can only accept operator with " + "floating types (complex/real)%s", + "\n"); + + cytnx_error_msg(Tin.uten_type() != UTenType.Dense, + "[ERROR][Lanczos_Exp] The Block UniTensor type is still not supported.%s", + "\n"); + + // check criteria and maxiter: + cytnx_error_msg(CvgCrit <= 0, "[ERROR][Lanczos_Exp] converge criteria must >0%s", "\n"); + cytnx_error_msg(Maxiter < 2, "[ERROR][Lanczos_Exp] Maxiter must >1%s", "\n"); + + // check Tin should be rank-1: + + UniTensor v0; + v0 = Tin.astype(Hop->dtype()); + + UniTensor out; + + double _cvgcrit = CvgCrit; + + if (Hop->dtype() == Type.Float || Hop->dtype() == Type.ComplexFloat) { + if (_cvgcrit < 1.0e-7) { + _cvgcrit = 1.0e-7; + cytnx_warning_msg( + _cvgcrit < 1.0e-7, + "[WARNING][Lanczos_Exp] for float precision type, CvgCrit cannot exceed " + "it's own type precision limit 1e-7, and it's auto capped to 1.0e-7.%s", + "\n"); + } + } + + //_Lanczos_Exp_Ut_positive(out, Hop, v0, _cvgcrit, Maxiter, verbose); + _Lanczos_Exp_Ut(out, Hop, v0, tau, _cvgcrit, Maxiter, verbose); + + return out; + + } // Lanczos_Exp entry point + + } // namespace linalg +} // namespace cytnx + +#endif // BACKEND_TORCH diff --git a/src/linalg/Lanczos_Gnd_Ut.cpp b/src/linalg/Lanczos_Gnd_Ut.cpp index 905f54c51..6028c6d47 100644 --- a/src/linalg/Lanczos_Gnd_Ut.cpp +++ b/src/linalg/Lanczos_Gnd_Ut.cpp @@ -19,40 +19,16 @@ namespace cytnx { typedef Accessor ac; using namespace std; - void unsafe_Sub_(UniTensor &UL, const Scalar &a, const UniTensor &UR) { - // perform UL -= a*UR) for each blocks. - //[Warning] 1. This function does not check the Bond mismatch of UL and UR. Use with caution. - // 2. 
This function does not check if UL and UR are of the same UTenType! - if (UL.uten_type() == UTenType.Block) { - for (cytnx_int64 blk = 0; blk < UL.Nblocks(); blk++) { - UL.get_block_(UR.get_itoi()[blk]) -= a * UR.get_block_(blk); - } - } else { - for (cytnx_int64 blk = 0; blk < UL.Nblocks(); blk++) { - UL.get_block_(blk) -= a * UR.get_block_(blk); - } - } - } - - void unsafe_Add_(UniTensor &UL, const Scalar &a, const UniTensor &UR) { - // perform UL += a*UR) for each blocks. - //[Warning] 1. This function does not check the Bond mismatch of UL and UR. Use with caution. - // 2. This function does not check if UL and UR are of the same UTenType! - if (UL.uten_type() == UTenType.Block) { - for (cytnx_int64 blk = 0; blk < UL.Nblocks(); blk++) { - UL.get_block_(UR.get_itoi()[blk]) += a * UR.get_block_(blk); - } - } else { - for (cytnx_int64 blk = 0; blk < UL.Nblocks(); blk++) { - UL.get_block_(blk) += a * UR.get_block_(blk); - } - } + // + static Scalar _Dot(const UniTensor &A, const UniTensor &B) { + return Contract(A.Dagger(), B).item(); } void _Lanczos_Gnd_general_Ut(std::vector &out, LinOp *Hop, const UniTensor &Tin, const bool &is_V, const double &CvgCrit, const unsigned int &Maxiter, const bool &verbose) { out.clear(); + std::vector psi_s; //[require] Tin should be provided! double Norm = double(Contract(Tin, Tin.Dagger()).item().real()); @@ -60,9 +36,9 @@ namespace cytnx { UniTensor psi_1 = Tin / Norm; psi_1.contiguous_(); - - // UniTensor psi_1 = Tin.clone(); - // psi_1.get_block_()/=Tin.get_block_().Norm(); + if (is_V) { + psi_s.push_back(psi_1); + } UniTensor psi_0; // = cytnx::zeros({psi_1.shape()[0]},psi_1.dtype(),Tin.device()); UniTensor new_psi; @@ -91,40 +67,31 @@ namespace cytnx { "[ERROR] LinOp.matvec(UniTensor) -> UniTensor the output should have same " "labels and shape as input!%s", "\n"); - As(0) = Contract(new_psi.Dagger(), psi_1).item().real(); - // As(0) = - // linalg::Vectordot(new_psi.get_block_().flatten(),psi_1.get_block_().flatten(),true); - - // new_psi -= As(0)*psi_1; - - unsafe_Sub_(new_psi, As(0).item(), psi_1); - - Bs(0) = new_psi.Norm().item(); // sqrt(Contract(new_psi,new_psi.Dagger()).item().real()); - // Bs(0) = new_psi.get_block_().Norm(); + auto alpha = _Dot(new_psi, psi_1).real(); + As(0) = alpha; + new_psi -= alpha * psi_1; + auto beta = new_psi.Norm().item(); + Bs(0) = beta; psi_0 = psi_1; - - new_psi /= Bs(0).item(); - - // chekc: - + new_psi /= beta; psi_1 = new_psi; - E = As(0).item(); + if (is_V) { + psi_s.push_back(psi_1); + } + E = alpha; Scalar Ediff; ///--------------------------- // iteration LZ: for (unsigned int i = 1; i < Maxiter; i++) { - // new_psi = Hop->matvec(psi_1) - Bs(i-1)*psi_0; new_psi = Hop->matvec(psi_1); + alpha = _Dot(new_psi, psi_1).real(); + As.append(alpha); + new_psi -= (alpha * psi_1 + beta * psi_0); - As.append(Contract(new_psi.Dagger(), psi_1).item().real()); - // As.append(linalg::Vectordot(new_psi.get_block_().flatten(),psi_1.get_block_().flatten(),true).item()); - - unsafe_Sub_(new_psi, As(i).item(), psi_1); - unsafe_Sub_(new_psi, Bs(i - 1).item(), psi_0); - + // diagonalize try { // diagonalize: auto tmptmp = linalg::Tridiag(As, Bs, true, true, true); @@ -136,25 +103,24 @@ namespace cytnx { break; } - auto tmpB = - new_psi.Norm().item(); // sqrt(Contract(new_psi,new_psi.Dagger()).item().real()); - Bs.append(tmpB); + beta = new_psi.Norm().item(); + Bs.append(beta); - if (tmpB == 0) { + if (beta == 0) { cvg_fin = true; break; } psi_0 = psi_1; - - new_psi /= Bs(i).item(); - - psi_1 = new_psi; - + psi_1 = new_psi / 
beta; + if (is_V) { + psi_s.push_back(psi_1); + } Ediff = abs(E - tmpEsVs[0].storage().at(0)); - if (verbose) + if (verbose) { printf("iter[%d] Enr: %11.14f, diff from last iter: %11.14f\n", i, double(E), double(Ediff)); + } // chkf = true; if (Ediff < CvgCrit) { @@ -168,42 +134,19 @@ namespace cytnx { if (cvg_fin == false) { cytnx_warning_msg(true, "[WARNING] iteration not converge after Maxiter!.\n :: Note :: ignore if " - "this is intended%s", + " this is intended %s", "\n"); } out.push_back(UniTensor(tmpEsVs[0](0), false, 0)); if (is_V) { UniTensor eV; - Storage kryVg = tmpEsVs[1](0).storage(); tmpEsVs.pop_back(); - // restarted again, and evaluate the vectors on the fly: - - psi_1 = Tin.clone(); - psi_1 /= Norm; - // psi_1.contiguous_(); - - eV = kryVg.at(0) * psi_1; - // new_psi = Hop->matvec(psi_1) - As(0)*psi_1; - - new_psi = Hop->matvec(psi_1); - unsafe_Sub_(new_psi, As(0).item(), psi_1); - - psi_0 = psi_1; - psi_1 = new_psi / Bs(0).item(); + eV = kryVg.at(0) * psi_s.at(0); for (unsigned int n = 1; n < tmpEsVs[0].shape()[0]; n++) { - // eV += kryVg(n)*psi_1; - unsafe_Add_(eV, kryVg.at(n), psi_1); - // new_psi = Hop->matvec(psi_1) - Bs(n-1)*psi_0; - new_psi = Hop->matvec(psi_1); - unsafe_Sub_(new_psi, Bs(n - 1).item(), psi_0); - // new_psi -= As(n)*psi_1; - unsafe_Sub_(new_psi, As(n).item(), psi_1); - - psi_0 = psi_1; - psi_1 = new_psi / Bs(n).item(); + eV += kryVg.at(n) * psi_s.at(n); } out.push_back(eV); @@ -224,12 +167,7 @@ namespace cytnx { cytnx_error_msg(CvgCrit <= 0, "[ERROR][Lanczos] converge criteria must >0%s", "\n"); cytnx_error_msg(Maxiter < 2, "[ERROR][Lanczos] Maxiter must >1%s", "\n"); - // check Tin should be rank-1: - UniTensor v0; - // cytnx_error_msg(Tin.shape().size()!=1,"[ERROR][Lanczos] Tin should be rank-1%s","\n"); - // cytnx_error_msg(Tin.shape()[0]!=Hop->nx(),"[ERROR][Lanczos] Tin should have dimension - // consistent with Hop: [%d] %s",Hop->nx(),"\n"); v0 = Tin.astype(Hop->dtype()); std::vector out; @@ -248,28 +186,6 @@ namespace cytnx { _Lanczos_Gnd_general_Ut(out, Hop, v0, is_V, _cvgcrit, Maxiter, verbose); - /* - if(Hop->dtype()==Type.ComplexDouble){ - _Lanczos_Gnd_cd(out,Hop,v0,is_V,CvgCrit,Maxiter,verbose); - }else if(Hop->dtype()==Type.Double){ - _Lanczos_Gnd_d(out,Hop,v0,is_V,CvgCrit,Maxiter,verbose); - }else if(Hop->dtype()==Type.ComplexFloat){ - cytnx_warning_msg(CvgCrit<1.0e-7,"[WARNING][CvgCrit] for float precision type, CvgCrit - cannot exceed it's own type precision limit 1e-7, and it's auto capped to 1.0e-7.%s","\n"); - if(CvgCrit<1.0e-7) - _Lanczos_Gnd_cf(out,Hop,v0,is_V,1.0e-7,Maxiter,verbose); - else - _Lanczos_Gnd_cf(out,Hop,v0,is_V,CvgCrit,Maxiter,verbose); - }else{ - cytnx_warning_msg(CvgCrit<1.0e-7,"[WARNING][CvgCrit] for float precision type, CvgCrit - cannot exceed it's own type precision limit 1e-7, and it's auto capped to 1.0e-7.%s","\n"); - if(CvgCrit<1.0e-7) - _Lanczos_Gnd_f(out,Hop,v0,is_V,1.0e-7,Maxiter,verbose); - else - _Lanczos_Gnd_f(out,Hop,v0,is_V,CvgCrit,Maxiter,verbose); - } - */ - return out; } // Lanczos_Gnd entry point diff --git a/src/linalg/Svd_truncate.cpp b/src/linalg/Svd_truncate.cpp index ed425651c..dc0c73310 100644 --- a/src/linalg/Svd_truncate.cpp +++ b/src/linalg/Svd_truncate.cpp @@ -1,9 +1,10 @@ -#include "linalg.hpp" -#include "Accessor.hpp" #include + +#include "Accessor.hpp" #include "Tensor.hpp" #include "UniTensor.hpp" #include "algo.hpp" +#include "linalg.hpp" #ifdef BACKEND_TORCH #else @@ -14,7 +15,7 @@ namespace cytnx { typedef Accessor ac; std::vector Svd_truncate(const Tensor &Tin, const 
cytnx_uint64 &keepdim, const double &err, const bool &is_UvT, - const unsigned int &return_err) { + const unsigned int &return_err, const unsigned int &mindim) { cytnx_error_msg(return_err < 0, "[ERROR] return_err can only be positive int%s", "\n"); if (Tin.device() == Device.cpu) { std::vector tmps = Svd(Tin, is_UvT); @@ -22,7 +23,7 @@ namespace cytnx { Tensor terr({1}, Tin.dtype(), Tin.device()); cytnx::linalg_internal::lii.memcpyTruncation_ii[Tin.dtype()]( - tmps[1], tmps[2], tmps[0], terr, keepdim, err, is_UvT, is_UvT, return_err); + tmps[1], tmps[2], tmps[0], terr, keepdim, err, is_UvT, is_UvT, return_err, mindim); std::vector outT; outT.push_back(tmps[0]); @@ -39,7 +40,7 @@ namespace cytnx { Tensor terr({1}, Tin.dtype(), Tin.device()); cytnx::linalg_internal::lii.cudaMemcpyTruncation_ii[Tin.dtype()]( - tmps[1], tmps[2], tmps[0], terr, keepdim, err, is_UvT, is_UvT, return_err); + tmps[1], tmps[2], tmps[0], terr, keepdim, err, is_UvT, is_UvT, return_err, mindim); std::vector outT; outT.push_back(tmps[0]); @@ -67,7 +68,7 @@ namespace cytnx { void _svd_truncate_Dense_UT(std::vector &outCyT, const cytnx::UniTensor &Tin, const cytnx_uint64 &keepdim, const double &err, const bool &is_UvT, - const unsigned int &return_err) { + const unsigned int &return_err, const unsigned int &mindim) { // DenseUniTensor: cytnx_uint64 keep_dim = keepdim; @@ -85,7 +86,8 @@ namespace cytnx { for (cytnx_uint64 i = 0; i < Tin.rowrank(); i++) rowdim *= tmp.shape()[i]; tmp = tmp.reshape({rowdim, -1}); - vector outT = cytnx::linalg::Svd_truncate(tmp, keepdim, err, is_UvT, return_err); + vector outT = + cytnx::linalg::Svd_truncate(tmp, keepdim, err, is_UvT, return_err, mindim); // if(Tin.is_contiguous()) tmp.reshape_(oldshape); @@ -169,7 +171,7 @@ namespace cytnx { void _svd_truncate_Block_UT(std::vector &outCyT, const cytnx::UniTensor &Tin, const cytnx_uint64 &keepdim, const double &err, const bool &is_UvT, - const int &return_err) { + const int &return_err, const unsigned int &mindim) { cytnx_uint64 keep_dim = keepdim; outCyT = linalg::Gesvd(Tin, is_UvT, is_UvT); @@ -199,7 +201,7 @@ namespace cytnx { keep_dim = Sall.shape()[0]; Smin = Sall.storage()(0); smidx = 0; - while ((Smin < err)) { + while ((Smin < err) and keep_dim - 1 > mindim) { keep_dim -= 1; if (keep_dim == 0) break; smidx = Sall.shape()[0] - keep_dim; @@ -317,7 +319,8 @@ namespace cytnx { std::vector Svd_truncate(const cytnx::UniTensor &Tin, const cytnx_uint64 &keepdim, const double &err, - const bool &is_UvT, const unsigned int &return_err) { + const bool &is_UvT, const unsigned int &return_err, + const unsigned int &mindim) { // using rowrank to split the bond to form a matrix. 
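      // The new `mindim` argument bounds truncation from below: singular values smaller than
      // `err` are discarded only while the kept count would remain above `mindim` (see the
      // `while ((Smin < err) and keep_dim - 1 > mindim)` loop in the Block branch above), so a
      // minimum number of states survives regardless of `err`. A minimal call sketch (the
      // concrete numbers are for illustration only):
      //   auto outs = linalg::Svd_truncate(T, /*keepdim=*/64, /*err=*/1e-8,
      //                                    /*is_UvT=*/true, /*return_err=*/1, /*mindim=*/8);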
cytnx_error_msg((Tin.rowrank() < 1 || Tin.rank() == 1 || Tin.rowrank() == Tin.rank()), "[Svd][ERROR] Svd for UniTensor should have rank>1 and rank>rowrank>0%s", @@ -325,10 +328,10 @@ namespace cytnx { std::vector outCyT; if (Tin.uten_type() == UTenType.Dense) { - _svd_truncate_Dense_UT(outCyT, Tin, keepdim, err, is_UvT, return_err); + _svd_truncate_Dense_UT(outCyT, Tin, keepdim, err, is_UvT, return_err, mindim); } else if (Tin.uten_type() == UTenType.Block) { - _svd_truncate_Block_UT(outCyT, Tin, keepdim, err, is_UvT, return_err); + _svd_truncate_Block_UT(outCyT, Tin, keepdim, err, is_UvT, return_err, mindim); } else { cytnx_error_msg(true, "[ERROR] svd_truncate for UniTensor only support Dense/Block.%s", diff --git a/src/linalg/Tensordot.cpp b/src/linalg/Tensordot.cpp index ea5c9e5ca..63b927528 100644 --- a/src/linalg/Tensordot.cpp +++ b/src/linalg/Tensordot.cpp @@ -98,8 +98,8 @@ namespace cytnx { const std::vector &idxr, const bool &cacheL, const bool &cacheR) { unsigned int t = Tl.dtype(); - if (t == Type.Uint64 || t == Type.Int64 || t == Type.Uint16 || t == Type.Int16 || - t == Type.Bool) { + if (t == Type.Uint64 || t == Type.Int64 || t == Type.Uint32 || t == Type.Int32 || + t == Type.Uint16 || t == Type.Int16 || t == Type.Bool) { cytnx_warning_msg(true, "Unsupported data type in cuTensor: %s, use default implementation", Type.Typeinfos[Tl.dtype()].name); return _Tensordot_generic(out, Tl, Tr, idxl, idxr, cacheL, cacheR); diff --git a/src/linalg/Trace.cpp b/src/linalg/Trace.cpp index 0fe79959e..e1c8b778b 100644 --- a/src/linalg/Trace.cpp +++ b/src/linalg/Trace.cpp @@ -2,7 +2,7 @@ #include "utils/utils.hpp" #include "Tensor.hpp" #include "UniTensor.hpp" -//#include "cytnx.hpp" +// #include "cytnx.hpp" #ifdef BACKEND_TORCH #else diff --git a/tests/BlockUniTensor_test.cpp b/tests/BlockUniTensor_test.cpp index e498dd5fa..81b09aee1 100644 --- a/tests/BlockUniTensor_test.cpp +++ b/tests/BlockUniTensor_test.cpp @@ -149,6 +149,25 @@ TEST_F(BlockUniTensorTest, relabels_) { } TEST_F(BlockUniTensorTest, relabel) { + auto tmp = BUT1.clone(); + BUT1 = BUT1.relabel({"a", "b", "cd", "d"}); + EXPECT_EQ(BUT1.labels()[0], "a"); + EXPECT_EQ(BUT1.labels()[1], "b"); + EXPECT_EQ(BUT1.labels()[2], "cd"); + EXPECT_EQ(BUT1.labels()[3], "d"); + BUT1 = BUT1.relabel({"1", "-1", "2", "1000"}); + EXPECT_EQ(BUT1.labels()[0], "1"); + EXPECT_EQ(BUT1.labels()[1], "-1"); + EXPECT_EQ(BUT1.labels()[2], "2"); + EXPECT_EQ(BUT1.labels()[3], "1000"); + + EXPECT_THROW(BUT1.relabel({"a", "a", "b", "c"}), std::logic_error); + EXPECT_THROW(BUT1.relabel({"1", "1", "0", "-1"}), std::logic_error); + EXPECT_THROW(BUT1.relabel({"a"}), std::logic_error); + EXPECT_THROW(BUT1.relabel({"1", "2"}), std::logic_error); + EXPECT_THROW(BUT1.relabel({"a", "b", "c", "d", "e"}), std::logic_error); + + BUT1 = tmp; BUT1 = BUT1.relabel("0", "a"); BUT1 = BUT1.relabel("1", "b"); BUT1 = BUT1.relabel("2", "d"); @@ -183,6 +202,24 @@ TEST_F(BlockUniTensorTest, relabel) { // EXPECT_THROW(BUT1.relabel(5,'a'),std::logic_error); } TEST_F(BlockUniTensorTest, relabel_) { + auto tmp = BUT1.clone(); + BUT1.relabel_({"a", "b", "cd", "d"}); + EXPECT_EQ(BUT1.labels()[0], "a"); + EXPECT_EQ(BUT1.labels()[1], "b"); + EXPECT_EQ(BUT1.labels()[2], "cd"); + EXPECT_EQ(BUT1.labels()[3], "d"); + BUT1.relabel_({"1", "-1", "2", "1000"}); + EXPECT_EQ(BUT1.labels()[0], "1"); + EXPECT_EQ(BUT1.labels()[1], "-1"); + EXPECT_EQ(BUT1.labels()[2], "2"); + EXPECT_EQ(BUT1.labels()[3], "1000"); + EXPECT_THROW(BUT1.relabel_({"a", "a", "b", "c"}), std::logic_error); + 
EXPECT_THROW(BUT1.relabel_({"1", "1", "0", "-1"}), std::logic_error); + EXPECT_THROW(BUT1.relabel_({"a"}), std::logic_error); + EXPECT_THROW(BUT1.relabel_({"1", "2"}), std::logic_error); + EXPECT_THROW(BUT1.relabel_({"a", "b", "c", "d", "e"}), std::logic_error); + + BUT1 = tmp; BUT1.relabel_("0", "a"); BUT1.relabel_("1", "b"); BUT1.relabel_("2", "d"); diff --git a/tests/Bond_test.cpp b/tests/Bond_test.cpp index 6dfa5aabd..6d7a1de20 100644 --- a/tests/Bond_test.cpp +++ b/tests/Bond_test.cpp @@ -155,7 +155,7 @@ TEST(Bond, CombindBondSymm_v2) { Bond bd_sym_b = Bond(BD_BRA, {{0, 0}, {2, 1}}, {1, 1}, {Symmetry::U1(), Symmetry::Zn(2)}); Bond bd_sym_c = Bond(BD_BRA, {{1, 1}}, {2}, {Symmetry::U1(), Symmetry::Zn(2)}); - Bond bd_sym_d = bd_sym_a.combineBonds({bd_sym_b, bd_sym_c}); + Bond bd_sym_d = bd_sym_a.combineBond({bd_sym_b, bd_sym_c}); EXPECT_EQ(bd_sym_d.type(), BD_BRA); EXPECT_EQ(bd_sym_d, Bond(BD_BRA, {{1, 0}, {3, 1}}, {6, 6}, {Symmetry::U1(), Symmetry::Zn(2)})); diff --git a/tests/Bond_test.h b/tests/Bond_test.h index abf1dd2f5..8625be037 100644 --- a/tests/Bond_test.h +++ b/tests/Bond_test.h @@ -1,6 +1,6 @@ #pragma once -//#ifndef __CYTNX_TEST_BOND_H__ -//#define __CYTNX_TEST_BOND_H__ +// #ifndef __CYTNX_TEST_BOND_H__ +// #define __CYTNX_TEST_BOND_H__ #include #include #include diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 75085cc6c..94e198dd4 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -11,7 +11,7 @@ add_executable( Network_test.cpp UniTensor_base_test.cpp ncon_test.cpp - Contracts_test.cpp + Contract_test.cpp BlockUniTensor_test.cpp DenseUniTensor_test.cpp Accessor_test.cpp @@ -23,8 +23,10 @@ add_executable( linalg_test/Directsum_test.cpp linalg_test/ExpH_test.cpp linalg_test/ExpM_test.cpp + linalg_test/Lanczos_Exp_test.cpp linalg_test/Lanczos_Gnd_test.cpp linalg_test/Arnoldi_test.cpp + linalg_test/Arnoldi_Ut_test.cpp linalg_test/Svd_test.cpp linalg_test/GeSvd_test.cpp linalg_test/linalg_test.cpp diff --git a/tests/Contract_test.cpp b/tests/Contract_test.cpp new file mode 100644 index 000000000..1734a3ebb --- /dev/null +++ b/tests/Contract_test.cpp @@ -0,0 +1,49 @@ +#include "Contract_test.h" + +using namespace std; +using namespace cytnx; + +TEST_F(ContractTest, Contract_denseUt_optimal_order) { + UniTensor res = Contract({utdnA, utdnB, utdnC}, "", true); + EXPECT_TRUE(AreNearlyEqTensor(res.get_block(), utdnAns.get_block(), 1e-12)); +} + +TEST_F(ContractTest, Contract_denseUt_default_order) { + UniTensor res = Contract({utdnA, utdnB, utdnC}, "", false); + EXPECT_TRUE(AreNearlyEqTensor(res.get_block(), utdnAns.get_block(), 1e-12)); +} + +TEST_F(ContractTest, Contract_denseUt_specified_order) { + UniTensor res = + Contract({utdnA.set_name("A"), utdnB.set_name("B"), utdnC.set_name("C")}, "(C,(A,B))", false); + EXPECT_TRUE(AreNearlyEqTensor(res.get_block().contiguous(), utdnAns.get_block(), 1e-12)); +} + +TEST_F(ContractTest, Contract_denseUt_optimal_specified_order) { + UniTensor res = + Contract({utdnA.set_name("A"), utdnB.set_name("B"), utdnC.set_name("C")}, "(C,(A,B))", true); + EXPECT_TRUE(AreNearlyEqTensor(res.get_block().contiguous(), utdnAns.get_block(), 1e-12)); +} + +// Deprecated Contracts +TEST_F(ContractTest, Contracts_denseUt_optimal_order) { + UniTensor res = Contracts({utdnA, utdnB, utdnC}, "", true); + EXPECT_TRUE(AreNearlyEqTensor(res.get_block(), utdnAns.get_block(), 1e-12)); +} + +TEST_F(ContractTest, Contracts_denseUt_default_order) { + UniTensor res = Contracts({utdnA, utdnB, utdnC}, "", false); + 
EXPECT_TRUE(AreNearlyEqTensor(res.get_block(), utdnAns.get_block(), 1e-12)); +} + +TEST_F(ContractTest, Contracts_denseUt_specified_order) { + UniTensor res = + Contracts({utdnA.set_name("A"), utdnB.set_name("B"), utdnC.set_name("C")}, "(C,(A,B))", false); + EXPECT_TRUE(AreNearlyEqTensor(res.get_block().contiguous(), utdnAns.get_block(), 1e-12)); +} + +TEST_F(ContractTest, Contracts_denseUt_optimal_specified_order) { + UniTensor res = + Contracts({utdnA.set_name("A"), utdnB.set_name("B"), utdnC.set_name("C")}, "(C,(A,B))", true); + EXPECT_TRUE(AreNearlyEqTensor(res.get_block().contiguous(), utdnAns.get_block(), 1e-12)); +} diff --git a/tests/Contracts_test.h b/tests/Contract_test.h similarity index 92% rename from tests/Contracts_test.h rename to tests/Contract_test.h index ea43f4731..88a6d470b 100644 --- a/tests/Contracts_test.h +++ b/tests/Contract_test.h @@ -1,5 +1,5 @@ -#ifndef _H_contracts_test -#define _H_contracts_test +#ifndef _H_contract_test +#define _H_contract_test #include "cytnx.hpp" #include @@ -8,7 +8,7 @@ using namespace cytnx; using namespace TestTools; -class ContractsTest : public ::testing::Test { +class ContractTest : public ::testing::Test { public: // std::pair, std::vector>> input; UniTensor utdnA = UniTensor(arange(0, 8, 1, Type.ComplexDouble)).reshape({2, 2, 2}); diff --git a/tests/Contracts_test.cpp b/tests/Contracts_test.cpp deleted file mode 100644 index d3f953b1c..000000000 --- a/tests/Contracts_test.cpp +++ /dev/null @@ -1,26 +0,0 @@ -#include "Contracts_test.h" - -using namespace std; -using namespace cytnx; - -TEST_F(ContractsTest, Contracts_denseUt_optimal_order) { - UniTensor res = Contracts({utdnA, utdnB, utdnC}, "", true); - EXPECT_TRUE(AreNearlyEqTensor(res.get_block(), utdnAns.get_block(), 1e-12)); -} - -TEST_F(ContractsTest, Contracts_denseUt_default_order) { - UniTensor res = Contracts({utdnA, utdnB, utdnC}, "", false); - EXPECT_TRUE(AreNearlyEqTensor(res.get_block(), utdnAns.get_block(), 1e-12)); -} - -TEST_F(ContractsTest, Contracts_denseUt_specified_order) { - UniTensor res = - Contracts({utdnA.set_name("A"), utdnB.set_name("B"), utdnC.set_name("C")}, "(C,(A,B))", false); - EXPECT_TRUE(AreNearlyEqTensor(res.get_block().contiguous(), utdnAns.get_block(), 1e-12)); -} - -TEST_F(ContractsTest, Contracts_denseUt_optimal_specified_order) { - UniTensor res = - Contracts({utdnA.set_name("A"), utdnB.set_name("B"), utdnC.set_name("C")}, "(C,(A,B))", true); - EXPECT_TRUE(AreNearlyEqTensor(res.get_block().contiguous(), utdnAns.get_block(), 1e-12)); -} diff --git a/tests/DenseUniTensor_test.cpp b/tests/DenseUniTensor_test.cpp index 2e99304d4..d6dcf06a6 100644 --- a/tests/DenseUniTensor_test.cpp +++ b/tests/DenseUniTensor_test.cpp @@ -575,7 +575,26 @@ TEST_F(DenseUniTensorTest, relabels_) { } TEST_F(DenseUniTensorTest, relabel) { - UniTensor ut; + auto tmp = utzero3456.clone(); + auto ut = utzero3456.relabel({"a", "b", "cd", "d"}); + EXPECT_EQ(utzero3456.labels()[0], "0"); + EXPECT_EQ(utzero3456.labels()[1], "1"); + EXPECT_EQ(utzero3456.labels()[2], "2"); + EXPECT_EQ(utzero3456.labels()[3], "3"); + EXPECT_EQ(ut.labels()[0], "a"); + EXPECT_EQ(ut.labels()[1], "b"); + EXPECT_EQ(ut.labels()[2], "cd"); + EXPECT_EQ(ut.labels()[3], "d"); + ut = utzero3456.relabel({"1", "-1", "2", "1000"}); + EXPECT_THROW(ut.relabel({"a", "a", "b", "c"}), std::logic_error); + EXPECT_THROW(ut.relabel({"1", "1", "0", "-1"}), std::logic_error); + EXPECT_THROW(ut.relabel({"a"}), std::logic_error); + EXPECT_THROW(ut.relabel({"1", "2"}), std::logic_error); + EXPECT_THROW(ut.relabel({"a", 
"b", "c", "d", "e"}), std::logic_error); + EXPECT_THROW(ut_uninit.relabel({"a", "b", "c", "d", "e"}), std::logic_error); + + utzero3456 = tmp; + // UniTensor ut; ut = utzero3456.relabel("0", "a"); EXPECT_EQ(ut.labels()[0], "a"); ut = utzero3456.relabel("1", "b"); @@ -617,7 +636,26 @@ TEST_F(DenseUniTensorTest, relabel) { EXPECT_THROW(ut_uninit.relabel(0, ""), std::logic_error); } TEST_F(DenseUniTensorTest, relabel_) { - UniTensor ut; + auto tmp = utzero3456.clone(); + auto ut = utzero3456.relabel_({"a", "b", "cd", "d"}); + EXPECT_EQ(utzero3456.labels()[0], "a"); + EXPECT_EQ(utzero3456.labels()[1], "b"); + EXPECT_EQ(utzero3456.labels()[2], "cd"); + EXPECT_EQ(utzero3456.labels()[3], "d"); + EXPECT_EQ(ut.labels()[0], "a"); + EXPECT_EQ(ut.labels()[1], "b"); + EXPECT_EQ(ut.labels()[2], "cd"); + EXPECT_EQ(ut.labels()[3], "d"); + ut = utzero3456.relabel_({"1", "-1", "2", "1000"}); + EXPECT_THROW(ut.relabel_({"a", "a", "b", "c"}), std::logic_error); + EXPECT_THROW(ut.relabel_({"1", "1", "0", "-1"}), std::logic_error); + EXPECT_THROW(ut.relabel_({"a"}), std::logic_error); + EXPECT_THROW(ut.relabel_({"1", "2"}), std::logic_error); + EXPECT_THROW(ut.relabel_({"a", "b", "c", "d", "e"}), std::logic_error); + EXPECT_THROW(ut_uninit.relabel_({"a", "b", "c", "d", "e"}), std::logic_error); + + utzero3456 = tmp; + // UniTensor ut; ut = utzero3456.relabel_("0", "a"); ut = utzero3456.relabel_("1", "b"); ut = utzero3456.relabel_("2", "d"); @@ -1862,6 +1900,63 @@ TEST_F(DenseUniTensorTest, combineBonds_ut_uninit) { EXPECT_ANY_THROW(ut_uninit.combineBonds(labels_combine)); } +/*=====test info===== +describe:test combineBond +====================*/ +TEST_F(DenseUniTensorTest, combineBond) { + std::vector labels = {"a", "b", "c"}; + auto ut = UniTensor({Bond(5), Bond(4), Bond(3)}, labels); + ut.set_rowrank(1); + int seed = 0; + random::uniform_(ut, -100.0, 100.0, seed); + std::vector labels_combine = {"b", "c"}; + ut.combineBond(labels_combine); + + // construct answer directly + labels = {"a", "b"}; + int rowrank = 1; + auto ans_ut = UniTensor({Bond(5), Bond(12)}, labels, rowrank); + auto tens = ut.get_block().reshape({5, 12}); + ans_ut.put_block(tens); + + // compare + EXPECT_TRUE(AreEqUniTensor(ut, ans_ut)); +} + +/*=====test info===== +describe:test combineBond with diagonal UniTensor +====================*/ +TEST_F(DenseUniTensorTest, combineBond_diag) { + EXPECT_THROW(ut_complex_diag.combineBond(ut_complex_diag.labels()), std::logic_error); +} + +/*=====test info===== +describe:test combineBond error +====================*/ +TEST_F(DenseUniTensorTest, combineBond_error) { + std::vector labels = {"a", "b", "c"}; + auto ut = UniTensor({Bond(5), Bond(4), Bond(3)}, labels); + ut.set_rowrank(1); + int seed = 0; + random::uniform_(ut, -100.0, 100.0, seed); + + // not exist labels + std::vector labels_combine = {"c", "d"}; + EXPECT_THROW(ut.combineBond(labels_combine), std::logic_error); + + // empty combine's label + labels_combine = std::vector(); + EXPECT_THROW(ut.combineBond(labels_combine), std::logic_error); +} + +/*=====test info===== +describe:test combindbonds with uninitialized UniTensor +====================*/ +TEST_F(DenseUniTensorTest, combineBond_ut_uninit) { + std::vector labels_combine = {}; + EXPECT_ANY_THROW(ut_uninit.combineBond(labels_combine)); +} + TEST_F(DenseUniTensorTest, contract1) { ut1.set_labels({"a", "b", "c", "d"}); ut2.set_labels({"a", "aa", "bb", "cc"}); diff --git a/tests/Network_test.cpp b/tests/Network_test.cpp index a306a0b23..a13763c99 100644 --- a/tests/Network_test.cpp +++ 
b/tests/Network_test.cpp @@ -80,5 +80,5 @@ TEST_F(NetworkTest, Network_dense_TOUT_no_colon) { net.PutUniTensors({"A", "B", "C"}, {utdnA, utdnB, utdnC}); auto res = net.Launch(); EXPECT_TRUE(AreNearlyEqTensor(res.get_block(), utdnAns.get_block(), 1e-12)); - EXPECT_EQ(res.rowrank(), 0); + EXPECT_EQ(res.rowrank(), 1); } diff --git a/tests/OLDtest.cpp b/tests/OLDtest.cpp index 8bd91afcd..63e5d8667 100644 --- a/tests/OLDtest.cpp +++ b/tests/OLDtest.cpp @@ -4,7 +4,7 @@ #include #include #include "hptt.h" -//#include "cutt.h" +// #include "cutt.h" using namespace std; using namespace cytnx; @@ -199,8 +199,8 @@ int main(int argc, char *argv[]) { auto Tc = zeros({4, 1, 1}); Tc.Conj_(); - auto L0 = UniTensor(zeros({4, 1, 1}), 0); //#Left boundary - auto R0 = UniTensor(zeros({4, 1, 1}), 0); //#Right boundary + auto L0 = UniTensor(zeros({4, 1, 1}), 0); // #Left boundary + auto R0 = UniTensor(zeros({4, 1, 1}), 0); // #Right boundary L0.get_block_()(0, 0, 0) = 1.; R0.get_block_()(3, 0, 0) = 1.; diff --git a/tests/gpu/BlockUniTensor_test.cpp b/tests/gpu/BlockUniTensor_test.cpp index a0dc29c9f..f43a79751 100644 --- a/tests/gpu/BlockUniTensor_test.cpp +++ b/tests/gpu/BlockUniTensor_test.cpp @@ -74,6 +74,25 @@ TEST_F(BlockUniTensorTest, gpu_relabels_) { } TEST_F(BlockUniTensorTest, gpu_relabel) { + auto tmp = BUT1.clone(); + BUT1 = BUT1.relabels({"a", "b", "cd", "d"}); + EXPECT_EQ(BUT1.labels()[0], "a"); + EXPECT_EQ(BUT1.labels()[1], "b"); + EXPECT_EQ(BUT1.labels()[2], "cd"); + EXPECT_EQ(BUT1.labels()[3], "d"); + BUT1 = BUT1.relabels({"1", "-1", "2", "1000"}); + EXPECT_EQ(BUT1.labels()[0], "1"); + EXPECT_EQ(BUT1.labels()[1], "-1"); + EXPECT_EQ(BUT1.labels()[2], "2"); + EXPECT_EQ(BUT1.labels()[3], "1000"); + + EXPECT_THROW(BUT1.relabels({"a", "a", "b", "c"}), std::logic_error); + EXPECT_THROW(BUT1.relabels({"1", "1", "0", "-1"}), std::logic_error); + EXPECT_THROW(BUT1.relabels({"a"}), std::logic_error); + EXPECT_THROW(BUT1.relabels({"1", "2"}), std::logic_error); + EXPECT_THROW(BUT1.relabels({"a", "b", "c", "d", "e"}), std::logic_error); + + BUT1 = tmp; BUT1 = BUT1.relabel("0", "a"); BUT1 = BUT1.relabel("1", "b"); BUT1 = BUT1.relabel("2", "d"); @@ -108,6 +127,24 @@ TEST_F(BlockUniTensorTest, gpu_relabel) { // EXPECT_THROW(BUT1.relabel(5,'a'),std::logic_error); } TEST_F(BlockUniTensorTest, gpu_relabel_) { + auto tmp = BUT1.clone(); + BUT1.relabels_({"a", "b", "cd", "d"}); + EXPECT_EQ(BUT1.labels()[0], "a"); + EXPECT_EQ(BUT1.labels()[1], "b"); + EXPECT_EQ(BUT1.labels()[2], "cd"); + EXPECT_EQ(BUT1.labels()[3], "d"); + BUT1.relabels_({"1", "-1", "2", "1000"}); + EXPECT_EQ(BUT1.labels()[0], "1"); + EXPECT_EQ(BUT1.labels()[1], "-1"); + EXPECT_EQ(BUT1.labels()[2], "2"); + EXPECT_EQ(BUT1.labels()[3], "1000"); + EXPECT_THROW(BUT1.relabels_({"a", "a", "b", "c"}), std::logic_error); + EXPECT_THROW(BUT1.relabels_({"1", "1", "0", "-1"}), std::logic_error); + EXPECT_THROW(BUT1.relabels_({"a"}), std::logic_error); + EXPECT_THROW(BUT1.relabels_({"1", "2"}), std::logic_error); + EXPECT_THROW(BUT1.relabels_({"a", "b", "c", "d", "e"}), std::logic_error); + + BUT1 = tmp; BUT1.relabel_("0", "a"); BUT1.relabel_("1", "b"); BUT1.relabel_("2", "d"); diff --git a/tests/gpu/Bond_test.cpp b/tests/gpu/Bond_test.cpp index df3964adc..f78eb2dca 100644 --- a/tests/gpu/Bond_test.cpp +++ b/tests/gpu/Bond_test.cpp @@ -49,7 +49,7 @@ TEST(Bond, gpu_CombineBondNoSymmBraKet) { EXPECT_EQ(bd5.type(), BD_BRA); bd3.set_type(BD_BRA); bd4.set_type(BD_BRA); - Bond bd_all = bd1.combineBonds({bd2, bd3, bd4}); + Bond bd_all = bd1.combineBond({bd2, bd3, 
bd4}); EXPECT_EQ(bd_all.dim(), 210); EXPECT_EQ(bd_all.type(), BD_BRA); EXPECT_EQ(bd_all.Nsym(), 0); @@ -155,7 +155,7 @@ TEST(Bond, gpu_CombindBondSymm_v2) { Bond bd_sym_b = Bond(BD_BRA, {{0, 0}, {2, 1}}, {1, 1}, {Symmetry::U1(), Symmetry::Zn(2)}); Bond bd_sym_c = Bond(BD_BRA, {{1, 1}}, {2}, {Symmetry::U1(), Symmetry::Zn(2)}); - Bond bd_sym_d = bd_sym_a.combineBonds({bd_sym_b, bd_sym_c}); + Bond bd_sym_d = bd_sym_a.combineBond({bd_sym_b, bd_sym_c}); EXPECT_EQ(bd_sym_d.type(), BD_BRA); EXPECT_EQ(bd_sym_d, Bond(BD_BRA, {{1, 0}, {3, 1}}, {6, 6}, {Symmetry::U1(), Symmetry::Zn(2)})); diff --git a/tests/gpu/Bond_test.h b/tests/gpu/Bond_test.h index abf1dd2f5..8625be037 100644 --- a/tests/gpu/Bond_test.h +++ b/tests/gpu/Bond_test.h @@ -1,6 +1,6 @@ #pragma once -//#ifndef __CYTNX_TEST_BOND_H__ -//#define __CYTNX_TEST_BOND_H__ +// #ifndef __CYTNX_TEST_BOND_H__ +// #define __CYTNX_TEST_BOND_H__ #include #include #include diff --git a/tests/gpu/CMakeLists.txt b/tests/gpu/CMakeLists.txt index d31d259b9..b60fac0a0 100644 --- a/tests/gpu/CMakeLists.txt +++ b/tests/gpu/CMakeLists.txt @@ -6,7 +6,7 @@ add_executable( Network_test.cpp UniTensor_base_test.cpp ncon_test.cpp - Contracts_test.cpp + Contract_test.cpp BlockUniTensor_test.cpp DenseUniTensor_test.cpp Accessor_test.cpp diff --git a/tests/gpu/Contract_test.cpp b/tests/gpu/Contract_test.cpp new file mode 100644 index 000000000..b5e4b3bd5 --- /dev/null +++ b/tests/gpu/Contract_test.cpp @@ -0,0 +1,49 @@ +#include "Contract_test.h" + +using namespace std; +using namespace cytnx; + +TEST_F(ContractTest, gpu_Contract_denseUt_optimal_order) { + UniTensor res = Contract({utdnA, utdnB, utdnC}, "", true); + EXPECT_TRUE(AreNearlyEqTensor(res.get_block(), utdnAns.get_block(), 1e-12)); +} + +TEST_F(ContractTest, gpu_Contract_denseUt_default_order) { + UniTensor res = Contract({utdnA, utdnB, utdnC}, "", false); + EXPECT_TRUE(AreNearlyEqTensor(res.get_block(), utdnAns.get_block(), 1e-12)); +} + +TEST_F(ContractTest, gpu_Contract_denseUt_specified_order) { + UniTensor res = + Contract({utdnA.set_name("A"), utdnB.set_name("B"), utdnC.set_name("C")}, "(C,(A,B))", false); + EXPECT_TRUE(AreNearlyEqTensor(res.get_block().contiguous(), utdnAns.get_block(), 1e-12)); +} + +TEST_F(ContractTest, gpu_Contract_denseUt_optimal_specified_order) { + UniTensor res = + Contract({utdnA.set_name("A"), utdnB.set_name("B"), utdnC.set_name("C")}, "(C,(A,B))", true); + EXPECT_TRUE(AreNearlyEqTensor(res.get_block().contiguous(), utdnAns.get_block(), 1e-12)); +} + +// Deprecated Contracts +TEST_F(ContractTest, gpu_Contracts_denseUt_optimal_order) { + UniTensor res = Contracts({utdnA, utdnB, utdnC}, "", true); + EXPECT_TRUE(AreNearlyEqTensor(res.get_block(), utdnAns.get_block(), 1e-12)); +} + +TEST_F(ContractTest, gpu_Contracts_denseUt_default_order) { + UniTensor res = Contracts({utdnA, utdnB, utdnC}, "", false); + EXPECT_TRUE(AreNearlyEqTensor(res.get_block(), utdnAns.get_block(), 1e-12)); +} + +TEST_F(ContractTest, gpu_Contracts_denseUt_specified_order) { + UniTensor res = + Contracts({utdnA.set_name("A"), utdnB.set_name("B"), utdnC.set_name("C")}, "(C,(A,B))", false); + EXPECT_TRUE(AreNearlyEqTensor(res.get_block().contiguous(), utdnAns.get_block(), 1e-12)); +} + +TEST_F(ContractTest, gpu_Contracts_denseUt_optimal_specified_order) { + UniTensor res = + Contracts({utdnA.set_name("A"), utdnB.set_name("B"), utdnC.set_name("C")}, "(C,(A,B))", true); + EXPECT_TRUE(AreNearlyEqTensor(res.get_block().contiguous(), utdnAns.get_block(), 1e-12)); +} diff --git a/tests/gpu/Contracts_test.h 
b/tests/gpu/Contract_test.h similarity index 92% rename from tests/gpu/Contracts_test.h rename to tests/gpu/Contract_test.h index 9913a910a..c00ff71c9 100644 --- a/tests/gpu/Contracts_test.h +++ b/tests/gpu/Contract_test.h @@ -1,5 +1,5 @@ -#ifndef _H_contracts_test -#define _H_contracts_test +#ifndef _H_contract_test +#define _H_contract_test #include "cytnx.hpp" #include @@ -8,7 +8,7 @@ using namespace cytnx; using namespace TestTools; -class ContractsTest : public ::testing::Test { +class ContractTest : public ::testing::Test { public: // std::pair, std::vector>> input; UniTensor utdnA = diff --git a/tests/gpu/Contracts_test.cpp b/tests/gpu/Contracts_test.cpp deleted file mode 100644 index 0e36f1e45..000000000 --- a/tests/gpu/Contracts_test.cpp +++ /dev/null @@ -1,26 +0,0 @@ -#include "Contracts_test.h" - -using namespace std; -using namespace cytnx; - -TEST_F(ContractsTest, gpu_Contracts_denseUt_optimal_order) { - UniTensor res = Contracts({utdnA, utdnB, utdnC}, "", true); - EXPECT_TRUE(AreNearlyEqTensor(res.get_block(), utdnAns.get_block(), 1e-12)); -} - -TEST_F(ContractsTest, gpu_Contracts_denseUt_default_order) { - UniTensor res = Contracts({utdnA, utdnB, utdnC}, "", false); - EXPECT_TRUE(AreNearlyEqTensor(res.get_block(), utdnAns.get_block(), 1e-12)); -} - -TEST_F(ContractsTest, gpu_Contracts_denseUt_specified_order) { - UniTensor res = - Contracts({utdnA.set_name("A"), utdnB.set_name("B"), utdnC.set_name("C")}, "(C,(A,B))", false); - EXPECT_TRUE(AreNearlyEqTensor(res.get_block().contiguous(), utdnAns.get_block(), 1e-12)); -} - -TEST_F(ContractsTest, gpu_Contracts_denseUt_optimal_specified_order) { - UniTensor res = - Contracts({utdnA.set_name("A"), utdnB.set_name("B"), utdnC.set_name("C")}, "(C,(A,B))", true); - EXPECT_TRUE(AreNearlyEqTensor(res.get_block().contiguous(), utdnAns.get_block(), 1e-12)); -} diff --git a/tests/gpu/DenseUniTensor_test.cpp b/tests/gpu/DenseUniTensor_test.cpp index 9b06ded3f..05e19c8bc 100644 --- a/tests/gpu/DenseUniTensor_test.cpp +++ b/tests/gpu/DenseUniTensor_test.cpp @@ -47,6 +47,20 @@ TEST_F(DenseUniTensorTest, gpu_relabels_) { } TEST_F(DenseUniTensorTest, gpu_relabel) { + auto tmp = utzero3456.clone(); + utzero3456 = utzero3456.relabel({"a", "b", "cd", "d"}); + EXPECT_EQ(utzero3456.labels()[0], "a"); + EXPECT_EQ(utzero3456.labels()[1], "b"); + EXPECT_EQ(utzero3456.labels()[2], "cd"); + EXPECT_EQ(utzero3456.labels()[3], "d"); + utzero3456 = utzero3456.relabel({"1", "-1", "2", "1000"}); + EXPECT_THROW(utzero3456.relabel({"a", "a", "b", "c"}), std::logic_error); + EXPECT_THROW(utzero3456.relabel({"1", "1", "0", "-1"}), std::logic_error); + EXPECT_THROW(utzero3456.relabel({"a"}), std::logic_error); + EXPECT_THROW(utzero3456.relabel({"1", "2"}), std::logic_error); + EXPECT_THROW(utzero3456.relabel({"a", "b", "c", "d", "e"}), std::logic_error); + + utzero3456 = tmp; utzero3456 = utzero3456.relabel("0", "a"); utzero3456 = utzero3456.relabel("1", "b"); utzero3456 = utzero3456.relabel("2", "d"); @@ -76,6 +90,20 @@ TEST_F(DenseUniTensorTest, gpu_relabel) { // EXPECT_THROW(utzero3456.relabel(5,'a'),std::logic_error); } TEST_F(DenseUniTensorTest, gpu_relabel_) { + auto tmp = utzero3456.clone(); + utzero3456.relabel_({"a", "b", "cd", "d"}); + EXPECT_EQ(utzero3456.labels()[0], "a"); + EXPECT_EQ(utzero3456.labels()[1], "b"); + EXPECT_EQ(utzero3456.labels()[2], "cd"); + EXPECT_EQ(utzero3456.labels()[3], "d"); + utzero3456.relabel_({"1", "-1", "2", "1000"}); + EXPECT_THROW(utzero3456.relabel_({"a", "a", "b", "c"}), std::logic_error); + 
EXPECT_THROW(utzero3456.relabel_({"1", "1", "0", "-1"}), std::logic_error); + EXPECT_THROW(utzero3456.relabel_({"a"}), std::logic_error); + EXPECT_THROW(utzero3456.relabel_({"1", "2"}), std::logic_error); + EXPECT_THROW(utzero3456.relabel_({"a", "b", "c", "d", "e"}), std::logic_error); + + utzero3456 = tmp; utzero3456.relabel_("0", "a"); utzero3456.relabel_("1", "b"); utzero3456.relabel_("2", "d"); diff --git a/tests/gpu/OLDtest.cpp b/tests/gpu/OLDtest.cpp index 8bd91afcd..63e5d8667 100644 --- a/tests/gpu/OLDtest.cpp +++ b/tests/gpu/OLDtest.cpp @@ -4,7 +4,7 @@ #include #include #include "hptt.h" -//#include "cutt.h" +// #include "cutt.h" using namespace std; using namespace cytnx; @@ -199,8 +199,8 @@ int main(int argc, char *argv[]) { auto Tc = zeros({4, 1, 1}); Tc.Conj_(); - auto L0 = UniTensor(zeros({4, 1, 1}), 0); //#Left boundary - auto R0 = UniTensor(zeros({4, 1, 1}), 0); //#Right boundary + auto L0 = UniTensor(zeros({4, 1, 1}), 0); // #Left boundary + auto R0 = UniTensor(zeros({4, 1, 1}), 0); // #Right boundary L0.get_block_()(0, 0, 0) = 1.; R0.get_block_()(3, 0, 0) = 1.; diff --git a/tests/gpu/utils/getNconParameter.cpp b/tests/gpu/utils/getNconParameter.cpp index 2cf35da61..a16b5ac7c 100644 --- a/tests/gpu/utils/getNconParameter.cpp +++ b/tests/gpu/utils/getNconParameter.cpp @@ -2,11 +2,11 @@ using namespace std; using namespace cytnx; -//#define int long long +// #define int long long #define rep(i, a, n) for (int i = a; i < n; i++) #define per(i, a, n) for (int i = n - 1; i >= a; i--) #define pb push_back -//#define mp make_pair +// #define mp make_pair #define all(x) (x).begin(), (x).end() #define fi first #define se second diff --git a/tests/linalg_test/Arnoldi_Ut_test.cpp b/tests/linalg_test/Arnoldi_Ut_test.cpp new file mode 100644 index 000000000..1e2c07c4c --- /dev/null +++ b/tests/linalg_test/Arnoldi_Ut_test.cpp @@ -0,0 +1,310 @@ +#include +#include +#include "cytnx.hpp" + +using namespace cytnx; +using namespace testing; + +namespace ArnoldiUtTest { + + cytnx_double tolerance = 1.0e-9; + + /* + * "al"+---A--- "ar" +-"al" + * | | | + * (l) |"phys" = lambda_i (l) + * | | | + * "bl"+---B--- "br" +-"bl" + */ + // define the Transfer matrix LinOp + class TMOp : public LinOp { + public: + UniTensor A, B; + UniTensor T_init; + TMOp(const int& d, const int& D, const cytnx_uint64& nx, + const unsigned int& dtype = Type.Double, const int& device = Device.cpu); + UniTensor matvec(const UniTensor& l) override { + auto tmp = Contracts({A, l, B}, "", true); + tmp.relabels_(l.labels()).set_rowrank(l.rowrank()); + return tmp; + } + + // only for test + /* + * "al"+---A--- "ar" + * | + * |"phys" + * | + * "bl"+---B--- "br" + */ + Tensor GetOpAsMat() { + int D = A.shape()[0]; + auto tmp = Contract(A, B); + auto mat = tmp.permute({"al", "bl", "ar", "br"}, 2).get_block_().reshape(D * D, D * D); + return mat; + } + }; + TMOp::TMOp(const int& d, const int& D, const cytnx_uint64& in_nx, const unsigned int& in_dtype, + const int& in_device) + : LinOp("mv", in_nx, in_dtype, in_device) { + std::vector bonds = {Bond(D), Bond(d), Bond(D)}; + A = UniTensor(bonds, {}, -1, in_dtype, in_device) + .set_name("A") + .relabels_({"al", "phys", "ar"}) + .set_rowrank(2); + B = UniTensor(bonds, {}, -1, in_dtype, in_device) + .set_name("B") + .relabels_({"bl", "phys", "br"}) + .set_rowrank(2); + T_init = UniTensor({Bond(D), Bond(D)}, {}, -1, in_dtype, in_device) + .set_name("l") + .relabels_({"al", "bl"}) + .set_rowrank(1); + if (Type.is_float(this->dtype())) { + double low = -1.0, high = 1.0; + int seed = 0; + 
A.uniform_(low, high, seed); + B.uniform_(low, high, seed); + T_init.uniform_(low, high, seed); + } + } + + // the function to check the answer + bool CheckResult(TMOp& H, const std::vector& arnoldi_eigs, const std::string& which, + const cytnx_uint64 k); + + void ExcuteTest(const std::string& which, const int& mat_type = Type.ComplexDouble, + const cytnx_uint64& k = 3) { + int D = 5, d = 2; + int dim = D * D; + TMOp H = TMOp(d, D, dim, mat_type); + const cytnx_uint64 maxiter = 10000; + const cytnx_double cvg_crit = tolerance; + std::vector arnoldi_eigs = + linalg::Arnoldi(&H, H.T_init, which, maxiter, cvg_crit, k); + H.GetOpAsMat(); + bool is_pass = CheckResult(H, arnoldi_eigs, which, k); + EXPECT_TRUE(is_pass); + } + + // corrected test + // 1-1, test for 'which' = 'LM' + TEST(Arnoldi_Ut, which_LM_test) { + std::string which = "LM"; + ExcuteTest(which); + } + + // 1-2, test for 'which' = 'LR' + TEST(Arnoldi_Ut, which_LR_test) { + std::string which = "LR"; + ExcuteTest(which); + } + + // 1-3, test for 'which' = 'LI' + TEST(Arnoldi_Ut, which_LI_test) { + std::string which = "LI"; + ExcuteTest(which); + } + + // 1-4, test for 'which' = 'SR' + TEST(Arnoldi_Ut, which_SR_test) { + std::string which = "SR"; + ExcuteTest(which); + } + + // 1-5, test for 'which' = 'SI' + TEST(Arnoldi_Ut, which_SI_test) { + std::string which = "SI"; + ExcuteTest(which); + } + + // 1-6, test matrix is real type + TEST(Arnoldi_Ut, mat_type_real_test) { + std::string which = "LM"; + auto mat_type = Type.Double; + ExcuteTest(which, mat_type); + } + + // 1-7, test eigenalue number k = 1 + TEST(Arnoldi_Ut, k1_test) { + std::string which = "LM"; + auto mat_type = Type.ComplexDouble; + cytnx_uint64 k = 1; + ExcuteTest(which, mat_type, k); + } + + // 1-8, test eigenalue number k match maximum, that means k = dim. + TEST(Arnoldi_Ut, k_max) { + std::string which = "LM"; + auto mat_type = Type.ComplexDouble; + cytnx_uint64 k; + k = 25; // D*D + ExcuteTest(which, mat_type, k); + } + + // 1-9, test the smallest matrix dimenstion. 
+ TEST(Arnoldi_Ut, smallest_dim) { + std::string which = "LM"; + auto mat_type = Type.ComplexDouble; + cytnx_uint64 k; + k = 1; + ExcuteTest(which, mat_type, k); + } + + // 1-10, test 'is_V' is false + // 1-11, test 'v_bose' is true + // 1-12, test converge criteria is large such that the iteration time may not reach 'k' + + // error test + class ErrorTestClass { + public: + std::string which = "LM"; + cytnx_uint64 k = 1; + cytnx_uint64 maxiter = 10; + int d = 2, D = 5; + cytnx_double cvg_crit = tolerance; + ErrorTestClass(){}; + void ExcuteErrorTest(); + cytnx_uint64 dim = D * D; + // set + void Set_dim(const int _dim) { + dim = _dim; + H = TMOp(d, D, dim); + } + void Set_mat_type(const int _mat_type) { + mat_type = _mat_type; + H = TMOp(d, D, dim, mat_type); + } + + private: + int mat_type = Type.ComplexDouble; + TMOp H = TMOp(d, D, dim, mat_type); + }; + void ErrorTestClass::ExcuteErrorTest() { + try { + auto arnoldi_eigs = linalg::Arnoldi(&H, H.T_init, which, maxiter, cvg_crit, k); + FAIL(); + } catch (const std::exception& ex) { + auto err_msg = ex.what(); + std::cerr << err_msg << std::endl; + SUCCEED(); + } + } + + // 2-1, test for wrong input 'which' + TEST(Arnoldi_Ut, err_which) { + ErrorTestClass err_task1; + err_task1.which = "ML"; + err_task1.ExcuteErrorTest(); + } + + // 2-2, test SM is not support for UniTensor + TEST(Arnoldi_Ut, err_which_SM) { + ErrorTestClass err_task; + err_task.which = "SM"; + err_task.ExcuteErrorTest(); + } + + // 2-3, test for wrong input LinOp dtype + TEST(Arnoldi_Ut, err_mat_type) { + ErrorTestClass err_task; + err_task.Set_mat_type(Type.Int64); + err_task.ExcuteErrorTest(); + } + + // 2-4, test for 'k' = 0 + TEST(Arnoldi_Ut, err_zero_k) { + ErrorTestClass err_task; + err_task.k = 0; + err_task.ExcuteErrorTest(); + } + + // 2-5, test for 'k' > 'max_iter' + TEST(Arnoldi_Ut, err_k_too_large) { + ErrorTestClass err_task; + err_task.k = err_task.dim + 1; + err_task.ExcuteErrorTest(); + } + + // 2-6, test cvg_crit <= 0 + TEST(Arnoldi_Ut, err_crit_negative) { + ErrorTestClass err_task; + err_task.cvg_crit = -0.001; + err_task.ExcuteErrorTest(); + } + + // 2-7, test nx not match + TEST(Arnoldi_Ut, nx_not_match) { + ErrorTestClass err_task; + err_task.Set_dim(5); + err_task.ExcuteErrorTest(); + } + + // For given 'which' = 'LM', 'SR', ...etc, sort the given eigenvalues. 
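  // The two-character `which` string is parsed as: first char 'L'/'S' = largest/smallest,
  // second char 'M'/'R'/'I' = by magnitude / real part / imaginary part, matching the
  // convention the tests pass to linalg::Arnoldi above.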
+ bool cmpNorm(const Scalar& l, const Scalar& r) { return abs(l) < abs(r); } + bool cmpReal(const Scalar& l, const Scalar& r) { return l.real() < r.real(); } + bool cmpImag(const Scalar& l, const Scalar& r) { return l.imag() < r.imag(); } + std::vector OrderEigvals(const Tensor& eigvals, const std::string& order_type) { + char small_or_large = order_type[0]; //'S' or 'L' + char metric_type = order_type[1]; //'M', 'R' or 'I' + auto eigvals_len = eigvals.shape()[0]; + auto ordered_eigvals = std::vector(eigvals_len, Scalar()); + for (cytnx_uint64 i = 0; i < eigvals_len; ++i) ordered_eigvals[i] = eigvals.storage().at(i); + std::function cmpFncPtr; + if (metric_type == 'M') { + cmpFncPtr = cmpNorm; + } else if (metric_type == 'R') { + cmpFncPtr = cmpReal; + } else if (metric_type == 'I') { + cmpFncPtr = cmpImag; + } else { // wrong input + ; + } + // sort eigenvalues + if (small_or_large == 'S') { + std::stable_sort(ordered_eigvals.begin(), ordered_eigvals.end(), cmpFncPtr); + } else { // 'L' + std::stable_sort(ordered_eigvals.rbegin(), ordered_eigvals.rend(), cmpFncPtr); + } + return ordered_eigvals; + } + + // get resigue |Hv - ev| + Scalar GetResidue(TMOp& H, const Scalar& eigval, const UniTensor& eigvec) { + UniTensor resi_vec = H.matvec(eigvec) - eigval * eigvec; + Scalar resi = resi_vec.Norm().item(); + return resi; + } + + // compare the arnoldi results with full spectrum (calculated by the function Eig.) + bool CheckResult(TMOp& H, const std::vector& arnoldi_eigs, const std::string& which, + const cytnx_uint64 k) { + // get full spectrum (eigenvalues) + std::vector full_eigs = linalg::Eig(H.GetOpAsMat()); + Tensor full_eigvals = full_eigs[0]; + std::vector ordered_eigvals = OrderEigvals(full_eigvals, which); + auto fst_few_eigvals = + std::vector(ordered_eigvals.begin(), ordered_eigvals.begin() + k); + Tensor arnoldi_eigvals = arnoldi_eigs[0].get_block_(); + + // check the number of the eigenvalues + cytnx_uint64 arnoldi_eigvals_len = arnoldi_eigvals.shape()[0]; + if (arnoldi_eigvals_len != k) return false; + for (cytnx_uint64 i = 0; i < k; ++i) { + auto arnoldi_eigval = arnoldi_eigvals.storage().at(i); + // if k == 1, arnoldi_eigvecs will be a rank-1 tensor + auto arnoldi_eigvec = arnoldi_eigs[i + 1]; + auto exact_eigval = fst_few_eigvals[i]; + // check eigen value by comparing with the full spectrum results. 
+ // avoid, for example, arnoldi_eigval = 1 + 3j, exact_eigval = 1 - 3j, which = 'LM' + auto eigval_err = abs(arnoldi_eigval) - abs(exact_eigval); + if (eigval_err >= tolerance) return false; + // check the is the eigenvector correct + auto resi_err = GetResidue(H, arnoldi_eigval, arnoldi_eigvec); + if (resi_err >= tolerance) return false; + // check phase + } + return true; + } + +} // namespace ArnoldiUtTest diff --git a/tests/linalg_test/Arnoldi_test.cpp b/tests/linalg_test/Arnoldi_test.cpp index ad9fdf031..dafa20c4a 100644 --- a/tests/linalg_test/Arnoldi_test.cpp +++ b/tests/linalg_test/Arnoldi_test.cpp @@ -129,12 +129,9 @@ namespace ArnoldiTest { cytnx_uint64 maxiter = 10000; cytnx_double cvg_crit = tolerance; ErrorTestClass(){}; + cytnx_uint64 dim = 3; void ExcuteErrorTest(); // set - void Set_dim(const cytnx_uint64 _dim) { - dim = _dim; - H = MatOp(dim, mat_type); - } void Set_mat_type(const int _mat_type) { mat_type = _mat_type; H = MatOp(dim, mat_type); @@ -142,7 +139,6 @@ namespace ArnoldiTest { private: int mat_type = Type.ComplexDouble; - cytnx_uint64 dim = 3; MatOp H = MatOp(dim, mat_type); }; void ErrorTestClass::ExcuteErrorTest() { @@ -180,7 +176,7 @@ namespace ArnoldiTest { // 2-4, test for 'k' > 'max_iter' TEST(Arnoldi, err_k_too_large) { ErrorTestClass err_task; - err_task.k = 0; + err_task.k = err_task.dim + 1; err_task.ExcuteErrorTest(); } diff --git a/tests/linalg_test/Lanczos_Exp_test.cpp b/tests/linalg_test/Lanczos_Exp_test.cpp new file mode 100644 index 000000000..5b98d1b6c --- /dev/null +++ b/tests/linalg_test/Lanczos_Exp_test.cpp @@ -0,0 +1,190 @@ +#include "cytnx.hpp" +#include +#include "../test_tools.h" + +using namespace cytnx; +using namespace testing; +using namespace TestTools; + +namespace Lanczos_Exp_Ut_Test { + + UniTensor CreateOneSiteEffHam(const int d, const int D, const unsigned int dypte = Type.Double, + const int device = Device.cpu); + UniTensor CreateA(const int d, const int D, const unsigned int dtype = Type.Double, + const int device = Device.cpu); + UniTensor GetAns(const UniTensor& HEff, const UniTensor& Tin, const Scalar& tau); + Scalar Dot(const UniTensor& A, const UniTensor& B) { return Contract(A.Dagger(), B).item(); } + class OneSiteOp : public LinOp { + public: + OneSiteOp(const int d, const int D, const unsigned int dtype = Type.Double, + const int& device = Device.cpu) + : LinOp("mv", D * D, dtype, device) { + EffH = CreateOneSiteEffHam(d, D, dtype, device); + } + UniTensor EffH; + + /* + * |-|--"vil" "pi" "vir"--|-| |-|--"vil" "pi" "vir"--|-| + * | | + | | "po" | | + | | + * |L|- -------O----------|R| dot | = |L|- -------O----------|R| + * | | + | | "vol"--A--"vor" | | + | | + * |_|--"vol" "po" "vor"--|_| |_|---------A----------|_| + * + * Then relabels ["vil", "pi", "vir"] -> ["vol", "po", "vor"] + * + * "vil":virtual in bond left + * "po":physical out bond + */ + UniTensor matvec(const UniTensor& A) override { + auto tmp = Contract(EffH, A); + tmp.permute_({"vil", "pi", "vir"}, 1); + tmp.relabels_(A.labels()); + return tmp; + } + }; + + // describe:Real type test + TEST(Lanczos_Exp_Ut, RealTypeTest) { + int d = 2, D = 5; + auto op = OneSiteOp(d, D); + auto Tin = CreateA(d, D); + const double crit = 1.0e-10; + double tau = 0.1; + auto x = linalg::Lanczos_Exp(&op, Tin, tau, crit); + auto ans = GetAns(op.EffH, Tin, tau); + auto err = static_cast((x - ans).Norm().item().real()); + EXPECT_TRUE(err <= crit); + } + + // describe:Complex type test + TEST(Lanczos_Exp_Ut, ComplexTypeTest) { + int d = 2, D = 5; + auto op = OneSiteOp(d, D, 
Type.ComplexDouble); + auto Tin = CreateA(d, D, Type.ComplexDouble); + const double crit = 1.0e-9; + std::complex tau = std::complex(0, 1) * 0.1; + auto x = linalg::Lanczos_Exp(&op, Tin, tau, crit); + auto ans = GetAns(op.EffH, Tin, tau); + auto err = static_cast((x - ans).Norm().item().real()); + EXPECT_TRUE(err <= crit); + } + + // describe:Test non-Hermitian Op but the code will not crash + TEST(Lanczos_Exp_Ut, NonHermit) { + int d = 2, D = 5; + double low = -1.0, high = 1.0; + auto op = OneSiteOp(d, D); + op.EffH.uniform_(low, high, 0); + auto Tin = CreateA(d, D); + const double crit = 1.0e-3; + double tau = 0.1; + auto x = linalg::Lanczos_Exp(&op, Tin, tau, crit); + } + + // describe:input |v| != 1 + TEST(Lanczos_Exp_Ut, normVNot1) { + int d = 2, D = 5; + auto op = OneSiteOp(d, D); + auto Tin = CreateA(d, D) * 1.1; + const double crit = 1.0e-7; + double tau = 0.1; + auto x = linalg::Lanczos_Exp(&op, Tin, tau, crit); + auto ans = GetAns(op.EffH, Tin, tau); + auto err = static_cast((x - ans).Norm().item().real()); + EXPECT_TRUE(err <= crit); + } + + // describe:test incorrect data type + TEST(Lanczos_Exp_Ut, IncorrectDType) { + int d = 2, D = 10; + auto op = OneSiteOp(d, D, Type.Int64); + auto Tin = CreateA(d, D, Type.Int64); + const double crit = 1.0e-3; + double tau = 0.1; + EXPECT_THROW({ linalg::Lanczos_Exp(&op, Tin, crit, tau); }, std::logic_error); + } + + // describe:test not supported UniTensor Type + + /* + * -1 + * | + * 0--A--2 + */ + UniTensor CreateA(const int d, const int D, const unsigned int dtype, const int device) { + double low = -1.0, high = 1.0; + UniTensor A = UniTensor({Bond(D), Bond(d), Bond(D)}, {}, -1, dtype, device) + .set_name("A") + .relabels_({"vol", "po", "vor"}) + .set_rowrank_(1); + if (Type.is_float(A.dtype())) { + random::uniform_(A, low, high, 0); + } + // A = A / std::sqrt(double((Dot(A, A).real()))); + return A; + } + + /* + * |-|--"vil" "pi" "vir"--|-| + * | | + | | + * |L|- -------O----------|R| + * | | + | | + * |_|--"vol" "po" "vor"--|_| + */ + UniTensor CreateOneSiteEffHam(const int d, const int D, const unsigned int dtype, + const int device) { + double low = -1.0, high = 1.0; + std::vector bonds = {Bond(D), Bond(d), Bond(D), Bond(D), Bond(d), Bond(D)}; + std::vector heff_labels = {"vil", "pi", "vir", "vol", "po", "vor"}; + UniTensor HEff = UniTensor(bonds, {}, -1, dtype, device) + .set_name("HEff") + .relabels_(heff_labels) + .set_rowrank(bonds.size() / 2); + auto HEff_shape = HEff.shape(); + auto in_dim = 1; + for (int i = 0; i < HEff.rowrank(); ++i) { + in_dim *= HEff_shape[i]; + } + auto out_dim = in_dim; + if (Type.is_float(HEff.dtype())) { + random::uniform_(HEff, low, high, 0); + } + auto HEff_mat = HEff.get_block(); + HEff_mat.reshape_({in_dim, out_dim}); + HEff_mat = HEff_mat + HEff_mat.permute({1, 0}); // symmtrize + + // Let H can be converge in ExpM + auto eigs = HEff_mat.Eigh(); + auto e = UniTensor(eigs[0], true) * 0.01; + e.set_labels({"a", "b"}); + auto v = UniTensor(eigs[1]); + v.set_labels({"i", "a"}); + auto vt = UniTensor(linalg::InvM(v.get_block())); + vt.set_labels({"b", "j"}); + HEff_mat = Contract(Contract(e, v), vt).get_block(); + + // HEff_mat = linalg::Matmul(HEff_mat, HEff_mat.permute({1, 0}).Conj()); // positive definete + HEff_mat.reshape_(HEff_shape); + HEff.put_block(HEff_mat); + return HEff; + } + + UniTensor GetAns(const UniTensor& HEff, const UniTensor& Tin, const Scalar& tau) { + auto expH = HEff.clone(); + auto HEff_shape = HEff.shape(); + auto in_dim = 1; + for (int i = 0; i < HEff.rowrank(); ++i) { + 
in_dim *= HEff_shape[i]; + } + auto out_dim = in_dim; + // we use ExpM since tau*H will not be Hermitian if tau is complex number even H is Hermitian + expH.put_block( + linalg::ExpM((tau * expH.get_block()).reshape(in_dim, out_dim)).reshape(HEff_shape)); + auto ans = Contract(expH, Tin); + ans.permute_({"vil", "pi", "vir"}, 1); + ans.relabels_(Tin.labels()); + ans = Contract(expH, Tin); + return ans; + } + +} // namespace Lanczos_Exp_Ut_Test diff --git a/tests/utils/getNconParameter.cpp b/tests/utils/getNconParameter.cpp index 3450e31bd..29513168a 100644 --- a/tests/utils/getNconParameter.cpp +++ b/tests/utils/getNconParameter.cpp @@ -2,11 +2,11 @@ using namespace std; using namespace cytnx; -//#define int long long +// #define int long long #define rep(i, a, n) for (int i = a; i < n; i++) #define per(i, a, n) for (int i = n - 1; i >= a; i--) #define pb push_back -//#define mp make_pair +// #define mp make_pair #define all(x) (x).begin(), (x).end() #define fi first #define se second