diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml new file mode 100644 index 0000000..05df2ca --- /dev/null +++ b/.github/workflows/build.yml @@ -0,0 +1,25 @@ +name: Build and test cpu +on: + push: + paths-ignore: + - '**.md' + - 'LICENSE' + pull_request: + paths: + - '**.md' + - 'LICENSE' + +jobs: + build: + name: Build + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + with: + submodules: recursive + + - name: Build + run: make + + - name: Test cpu + run: make test-cpp diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..98e980a --- /dev/null +++ b/.gitignore @@ -0,0 +1,46 @@ +# Prerequisites +*.d + +# Compiled Object files +*.slo +*.lo +*.o +*.obj + +# Precompiled Headers +*.gch +*.pch + +# Compiled Dynamic libraries +*.so +*.dylib +*.dll + +# Fortran module files +*.mod +*.smod + +# Compiled Static libraries +*.lai +*.la +*.a +*.lib + +# Executables +*.exe +*.out +*.app + +build/ +build_debug/ + +.vscode/ + +# python +*.egg-info +*.pyc + +# onnx model +*.onnx +*.pb +*.npy diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 0000000..e856b94 --- /dev/null +++ b/.gitmodules @@ -0,0 +1,6 @@ +[submodule "3rd-party/nlohmann_json_cmake_fetchcontent"] + path = 3rd-party/nlohmann_json_cmake_fetchcontent + url = git@github.com:ArthurSonzogni/nlohmann_json_cmake_fetchcontent.git +[submodule "3rd-party/googletest"] + path = 3rd-party/googletest + url = git@github.com:google/googletest.git diff --git a/3rd-party/googletest b/3rd-party/googletest new file mode 160000 index 0000000..3e3b44c --- /dev/null +++ b/3rd-party/googletest @@ -0,0 +1 @@ +Subproject commit 3e3b44c300b21eb996a2957782421bc0f157af18 diff --git a/3rd-party/nlohmann_json_cmake_fetchcontent b/3rd-party/nlohmann_json_cmake_fetchcontent new file mode 160000 index 0000000..326308d --- /dev/null +++ b/3rd-party/nlohmann_json_cmake_fetchcontent @@ -0,0 +1 @@ +Subproject commit 326308d7512d2168ae00199aec4dd0f714526e89 diff --git a/CMakeLists.txt 
b/CMakeLists.txt new file mode 100644 index 0000000..836a7e0 --- /dev/null +++ b/CMakeLists.txt @@ -0,0 +1,96 @@ +# Do not change these options in this file. Use cmake.config, cmake -DOPTION=VALUE, or ccmake to specify them. +option(BUILD_TEST "Build tests" OFF) + +cmake_minimum_required(VERSION 3.17) + +include(CMakeDependentOption) +project(InfiniTensor C CXX) + +cmake_dependent_option(BUILD_TEST_CORE "Build tests for core components" ON BUILD_TEST OFF) + +set(DEFAULT_BUILD_TYPE "RelWithDebInfo") +# Build Type +if(CMAKE_BUILD_TYPE STREQUAL "Debug") + message("Configuring for Debug build.") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g -O0") + add_compile_definitions(DEBUG_MODE) +elseif(CMAKE_BUILD_TYPE STREQUAL "Release") + message("Configuring for Release build.") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O2") + add_compile_definitions(NDEBUG) +elseif(CMAKE_BUILD_TYPE STREQUAL "RelWithDebInfo") + message("Configuring for RelWithDebInfo build.") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g -O2") +else() + message("Build type not specified. 
Configuring for RelWithDebInfo build.") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g -O2") +endif() + + +set(CMAKE_CXX_STANDARD 17) +set(CMAKE_CXX_EXTENSIONS OFF) # -std=gnu++11 when on, -std=c++11 when off +add_compile_options(-Wno-error=unused-variable) + +find_package( + Python + COMPONENTS Interpreter Development + REQUIRED) + +# OpenMP +find_package(OpenMP) +if(OpenMP_C_FOUND) + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}") +endif() +if(OpenMP_CXX_FOUND) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}") +endif() + +include_directories(include) + +if(BUILD_TEST) + set(BUILD_GMOCK + OFF + CACHE BOOL "Do not build gmock" FORCE) + set(INSTALL_GTEST + OFF + CACHE BOOL "Do not install gtest" FORCE) + add_subdirectory(3rd-party/googletest) + include_directories(3rd-party/googletest/googletest/include) +endif() + +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g -Wall -Werror -Wno-error=deprecated-declarations -Wno-error=pointer-arith") +set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -UNDEBUG") # Enable assertion +set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} -UNDEBUG") # Enable assertion + + +# Source files +file(GLOB_RECURSE SRC src/core/*.cc src/kernels/cpu/*.cc src/operators/*.cc src/utils/*.cc) + +if(USE_INTELCPU) + file(GLOB_RECURSE SRC_INTELCPU src/intelcpu/*.cc src/kernels/intelcpu/*.cc ) + list (APPEND SRC ${SRC_INTELCPU}) +endif() + +# Libraries +add_library(InfiniTensor SHARED ${SRC}) + +function(build_test files) + # Non-recursive glob for skip failed tests + file(GLOB TEST_SOURCES ${files}) + foreach(testsourcefile ${TEST_SOURCES}) + get_filename_component(testname ${testsourcefile} NAME_WE) + add_executable(${testname} ${testsourcefile}) + target_link_libraries(${testname} InfiniTensor GTest::gtest_main) + add_test(NAME ${testname} COMMAND ${testname}) + endforeach(testsourcefile ${TEST_SOURCES}) +endfunction() + +if(BUILD_TEST) + add_compile_definitions(BUILD_TEST=1) + enable_testing() + 
if(BUILD_TEST_CORE) + build_test(test/core/*.cc) + build_test(test/operators/*.cc) + build_test(test/kernels/nativecpu/*.cc) + endif() +endif() diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..261eeb9 --- /dev/null +++ b/LICENSE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). 
+ + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. 
Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative 
Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. 
Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. 
+ + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..35ef7ef --- /dev/null +++ b/Makefile @@ -0,0 +1,18 @@ +.PHONY : build clean format install-python test-cpp test-onnx + +TYPE ?= Release +TEST ?= ON + +CMAKE_OPT = -DCMAKE_BUILD_TYPE=$(TYPE) +CMAKE_OPT += -DBUILD_TEST=$(TEST) + +build: + mkdir -p build/$(TYPE) + cd build/$(TYPE) && cmake $(CMAKE_OPT) ../.. 
&& make -j8 + +clean: + rm -rf build + +test-cpp: + @echo + cd build/$(TYPE) && make test diff --git a/README.md b/README.md new file mode 100644 index 0000000..7bbac1c --- /dev/null +++ b/README.md @@ -0,0 +1,7 @@ +# TinyInfiniTensor + +一个简化版的 ai compiler,用于初学者快速上手学习,保留了计算图和 kernel 层的概念,能够基于 c++ 搭建计算图进行推理计算,目前只支持 cpu 平台。 + +[环境部署文档](docs/项目部署.md) + +[训练营作业介绍文档](docs/训练营作业介绍.md) \ No newline at end of file diff --git "a/docs/\350\256\255\347\273\203\350\220\245\344\275\234\344\270\232\344\273\213\347\273\215.md" "b/docs/\350\256\255\347\273\203\350\220\245\344\275\234\344\270\232\344\273\213\347\273\215.md" new file mode 100644 index 0000000..f3a1f7b --- /dev/null +++ "b/docs/\350\256\255\347\273\203\350\220\245\344\275\234\344\270\232\344\273\213\347\273\215.md" @@ -0,0 +1,247 @@ +# 本地自测 + +运行 ``make test-cpp``,通过所有测例即为完成作业,每个测例 10 分。 + +1. test_allocator:依赖作业一 +2. test_cast:依赖作业四 +3. test_clip:依赖作业三 +4. test_concat:依赖作业五 +5. test_element_wise:依赖作业六 +6. test_transpose:依赖作业二 +7. test_nativecpu_concat:依赖作业一、作业五 +8. test_nativecpu_elementwise:依赖作业一、作业六 +9. test_nativecpu_transpose:依赖作业一、作业二 +10. test_matmul:依赖作业六、作业七 +11. 
test_graph:依赖作业八 + +# 作业题目 + +## 作业一:内存分配器 + +难度:⭐⭐⭐⭐ + +对应测例:``test_allocator``,``test_nativecpu_concat``,``test_nativecpu_elementwise``,``test_nativecpu_transpose`` + +### step1 完善 allocator 声明 + +需要实现的代码块位置:`include/core/allocator.h` + +````c++ +// =================================== 作业 =================================== +// TODO:可能需要设计一个数据结构来存储free block,以便于管理和合并 +// HINT: 可以使用一个 map 来存储 free block,key 为 block 的起始/结尾地址,value 为 block 的大小 +// =================================== 作业 =================================== +```` + +### step2 完善 allocator 定义 + +需要实现的代码块位置:`src/core/allocator.cc` + +完善分配函数: + +````c++ + size_t Allocator::alloc(size_t size) + { + IT_ASSERT(this->ptr == nullptr); + // pad the size to the multiple of alignment + size = this->getAlignedSize(size); + + // =================================== 作业 =================================== + // TODO: 设计一个算法来分配内存,返回起始地址偏移量 + // =================================== 作业 =================================== + + return 0; + } +```` + +完善释放函数: + +````c++ + void Allocator::free(size_t addr, size_t size) + { + IT_ASSERT(this->ptr == nullptr); + size = getAlignedSize(size); + + // =================================== 作业 =================================== + // TODO: 设计一个算法来回收内存 + // =================================== 作业 =================================== + } +```` + +### step3 计算图的内存分配部分 + +需要实现的代码块位置:`src/core/graph.cc` + +完善计算图的内存分配部分: + +````c++ + void GraphObj::dataMalloc() + { + // topological sorting first + IT_ASSERT(topo_sort() == true); + + // =================================== 作业 =================================== + // TODO:利用 allocator 给计算图分配内存 + // HINT: 获取分配好的内存指针后,可以调用 tensor 的 setDataBlob 函数给 tensor 绑定内存 + // =================================== 作业 =================================== + + allocator.info(); + } +```` + +## 作业二:transpose 算子形状推导 + +难度:⭐ + +对应测例:``test_transpose``,``test_nativecpu_transpose`` + +需要实现的代码块位置:`src/operators/transpose.cc` + +````c++ + optional> 
TransposeObj::inferShape(const TensorVec &inputs) + { + const auto A = inputs[0]; + auto input_dim = A->getDims(); + auto output_dim = input_dim; + int rank = A->getRank(); + + // =================================== 作业 =================================== + // TODO:修改 output_dim,返回正确的 transpose 后的 shape + // REF: https://onnx.ai/onnx/operators/onnx__Transpose.html#transpose-21 + // =================================== 作业 =================================== + + return {{}}; + } + +```` + +## 作业三:clip 算子形状推导 + +难度:⭐ + +对应测例:``test_clip`` + +需要实现的代码块位置:`src/operators/unary.cc` + +````c++ + optional> ClipObj::inferShape(const TensorVec &inputs) + { + // =================================== 作业 =================================== + // TODO:返回经过 clip 操作后的 shape + // REF: https://onnx.ai/onnx/operators/onnx__Clip.html#clip-13 + // =================================== 作业 =================================== + return {{}}; + } +```` + +## 作业四:cast 算子形状推导 & 数据类型推导 + +难度:⭐⭐ + +对应测例:``test_cast`` + +需要实现的代码块位置:`src/operators/unary.cc` + +````c++ + vector CastObj::inferDataType(const TensorVec &inputs) const + { + // =================================== 作业 =================================== + // TODO:返回经过 cast 操作后, 输出 tensor 的数目和数据类型 + // REF_FILE: src/core/operator.cc + // REF: https://onnx.ai/onnx/operators/onnx__Cast.html#cast-21 + // =================================== 作业 =================================== + return {}; + } + + optional> CastObj::inferShape(const TensorVec &inputs) + { + // =================================== 作业 =================================== + // TODO:返回经过 cast 操作后的 shape + // REF: https://onnx.ai/onnx/operators/onnx__Cast.html#cast-21 + // =================================== 作业 =================================== + return {{}}; + } +```` + + + +## 作业五:concat 算子形状推导 + +难度:⭐⭐ + +对应测例:``test_concat``,``test_nativecpu_concat`` + +需要实现的代码块位置:`src/operators/concat.cc` + +````c++ +optional> ConcatObj::inferShape(const TensorVec &inputs) { + Shape dims = 
inputs[0]->getDims(); + auto rank = inputs[0]->getRank(); + + // =================================== 作业 =================================== + // TODO:修改 dims,返回正确的 concat 后的 shape + // REF: https://onnx.ai/onnx/operators/onnx__Concat.html#concat-13 + // =================================== 作业 =================================== + + return {{dims}}; +} +```` + +## 作业六:双向广播 + +难度:⭐⭐⭐ + +对应测例:``test_element_wise``,``test_nativecpu_elementwise``,``test_matmul`` + +需要实现的代码块位置:`src/utils/operator_utils.cc` + +````c++ +Shape infer_broadcast(const Shape &A, const Shape &B) { + + // =================================== 作业 =================================== + // TODO:对 A 和 B 进行双向广播,返回广播后的形状。 + // REF: https://github.com/onnx/onnx/blob/main/docs/Broadcasting.md + // =================================== 作业 =================================== + + return {}; +} +```` + +## 作业七:矩阵乘形状推导 + +难度:⭐⭐⭐ + +对应测例:``test_matmul`` + +需要实现的代码块位置:`src/operators/matmul.cc` + +````c++ +optional> MatmulObj::inferShape(const TensorVec &inputs) + { + // =================================== 作业 =================================== + // TODO:返回经过 matmul 操作后的 shape + // REF: https://github.com/onnx/onnx/blob/main/docs/Operators.md#gemm + // =================================== 作业 =================================== + return {{}}; + } +```` + +## 作业八:简单图优化规则实现 + +难度:⭐⭐⭐⭐ + +对应测例:``test_graph`` + +需要实现的代码块位置:`src/core/graph.cc` + +````c++ +void GraphObj::optimize() +{ + // =================================== 作业 =================================== + // TODO: 设计一个算法来实现指定的图优化规则 + // 图优化规则如下: + // 1. 去除冗余的算子(例如,两个相邻的算子都是 transpose 算子,且做的是相反的操作,可以将其全部删除) + // 2. 
合并算子(例如,矩阵乘算子中含有属性transA、transB,如果其输入存在transpose,且对最后两个维度做交换,就可以将transpose融入到矩阵乘算子的属性中去) + // =================================== 作业 =================================== +} +```` \ No newline at end of file diff --git "a/docs/\351\241\271\347\233\256\351\203\250\347\275\262.md" "b/docs/\351\241\271\347\233\256\351\203\250\347\275\262.md" new file mode 100644 index 0000000..5690349 --- /dev/null +++ "b/docs/\351\241\271\347\233\256\351\203\250\347\275\262.md" @@ -0,0 +1,35 @@ +### 环境准备 +建议使用Linux系统或Mac系统,windows下使用WSL,配置方法和Linux一致。 + +1. 安装gcc、g++,请确认版本为 11.3 及以上的稳定版本 +``` bash +# linux 使用apt安装 +sudo apt install gcc g++ + +# mac 使用Homebrew安装 +brew install gcc +``` + +2. 安装CMake,请确认版本为 3.17 及以上的稳定版本 +``` bash +# linux 使用apt安装 +sudo apt install cmake + +# mac 使用Homebrew安装 +brew install cmake +``` + +2. 安装make +``` bash +# linux 使用apt安装 +sudo apt install make + +# mac 使用Homebrew安装 +brew install make +``` + +### 构建命令 +配置好上述环境后,进入项目目录后可以通过以下命令进行构建。 +- `make`/`make build`: 构建整个项目; +- `make test-cpp`: 构建项目后执行测例; +- `make clean`:清理生成文件 \ No newline at end of file diff --git a/include/core/allocator.h b/include/core/allocator.h new file mode 100644 index 0000000..002601d --- /dev/null +++ b/include/core/allocator.h @@ -0,0 +1,59 @@ +#pragma once +#include "core/runtime.h" +#include "core/tensor.h" +#ifdef BUILD_TEST +#include "gtest/gtest.h" +#endif +#include +#include +#include + +namespace infini { + class Allocator + { + private: + Runtime runtime; + + size_t used; + + size_t peak; + + size_t alignment; + + // pointer to the memory actually allocated + void *ptr; + + // =================================== 作业 =================================== + // TODO:可能需要设计一个数据结构来存储free block,以便于管理和合并 + // HINT: 可以使用一个 map 来存储 free block,key 为 block 的起始/结尾地址,value 为 block 的大小 + // =================================== 作业 =================================== + + public: + Allocator(Runtime runtime); + + virtual ~Allocator(); + + // function: simulate memory allocation + // arguments: + // 
size: size of memory block to be allocated + // return: head address offset of the allocated memory block + size_t alloc(size_t size); + + // function: simulate memory free + // arguments: + // addr: head address offset of memory block to be free + // size: size of memory block to be freed + void free(size_t addr, size_t size); + + // function: perform actual memory allocation + // return: pointer to the head address of the allocated memory + void *getPtr(); + + void info(); + + private: + // function: memory alignment, rouned up + // return: size of the aligned memory block + size_t getAlignedSize(size_t size); + }; +} diff --git a/include/core/blob.h b/include/core/blob.h new file mode 100644 index 0000000..01684f6 --- /dev/null +++ b/include/core/blob.h @@ -0,0 +1,25 @@ +#pragma once +#include "core/common.h" +#include "core/ref.h" + +namespace infini { + +class RuntimeObj; +using Runtime = Ref; + +class BlobObj +{ + Runtime runtime; + void *ptr; + +public: + BlobObj(Runtime runtime, void *ptr) : runtime(runtime), ptr(ptr) {} + BlobObj(BlobObj &other) = delete; + BlobObj &operator=(BlobObj const &) = delete; + ~BlobObj() {}; + + template + T getPtr() const { return reinterpret_cast(ptr); } +}; + +} // namespace infini diff --git a/include/core/common.h b/include/core/common.h new file mode 100644 index 0000000..e4fd65b --- /dev/null +++ b/include/core/common.h @@ -0,0 +1,85 @@ +#pragma once +#include "utils/exception.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace infini { +using std::list; +using std::map; +using std::optional; +using std::pair; +using std::set; +using std::string; +using std::tie; +using std::to_string; +using std::tuple; +using std::unordered_map; +using std::vector; + +// Metaprogramming utilities +#define _CAT(A, B) A##B +#define _SELECT(NAME, NUM) _CAT(NAME##_, NUM) +#define _GET_COUNT(_1, _2, _3, _4, _5, _6, _7, _8, _9, _10, COUNT, 
...) COUNT +#define _VA_SIZE(...) _GET_COUNT(__VA_ARGS__, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1) +#define _VA_SELECT(NAME, ...) _SELECT(NAME, _VA_SIZE(__VA_ARGS__))(__VA_ARGS__) + +// Assert: conditions should have no side effect +#define _IT_ASSERT_2(condition, info) \ + static_cast(condition) \ + ? void(0) \ + : throw ::infini::Exception( \ + std::string("[") + __FILE__ + ":" + std::to_string(__LINE__) + \ + "] Assertion failed (" + #condition + "): " + info) +#define _IT_ASSERT_1(condition) _IT_ASSERT_2(condition, "") +#define IT_ASSERT(...) _VA_SELECT(_IT_ASSERT, __VA_ARGS__) + +#define IT_TODO_HALT() _IT_ASSERT_2(false, "Unimplemented") +#define IT_TODO_HALT_MSG(msg) _IT_ASSERT_2(false, msg) +#define IT_ASSERT_TODO(condition) _IT_ASSERT_2(condition, "Unimplemented") +#define IT_TODO_SKIP() puts("Unimplemented " __FILE__ ":" __LINE__) + +// std::to_underlying is avaiable since C++23 +template auto enum_to_underlying(T e) { + return static_cast>(e); +} + +template std::string vecToString(const std::vector &vec) { + std::stringstream ss; + ss << "["; + for (size_t i = 0; i < vec.size(); ++i) { + ss << vec.at(i); + if (i < vec.size() - 1) { + ss << ","; + } + } + ss << "]"; + return ss.str(); +} + +template std::string vecToString(const T *st, size_t length) { + std::stringstream ss; + ss << "["; + size_t i = 0; + for (i = 0; i < length; i++) { + ss << *(st + i); + if (i < length - 1) { + ss << ","; + } + } + ss << "]"; + return ss.str(); +} + +} // namespace infini diff --git a/include/core/data_type.h b/include/core/data_type.h new file mode 100644 index 0000000..0b7c1fa --- /dev/null +++ b/include/core/data_type.h @@ -0,0 +1,103 @@ +#pragma once +#include "core/common.h" + +namespace infini { + +class DataType { + public: + // + static const DataType Undefine; + static const DataType Float32; + static const DataType UInt8; + static const DataType Int8; + static const DataType UInt16; + static const DataType Int16; + static const DataType Int32; + static const DataType 
Int64; + static const DataType String; + static const DataType Bool; + static const DataType Float16; + static const DataType Double; + static const DataType UInt32; + static const DataType UInt64; + static const DataType BFloat16; + // "sizePerElement" show the DType to cpu_type + // DataType::Bool -> int8_t DataType::Float16 -> uint16_t + static constexpr size_t sizePerElement[]{0, + sizeof(float), + sizeof(uint8_t), + sizeof(int8_t), + sizeof(uint16_t), + sizeof(int16_t), + sizeof(int32_t), + sizeof(int64_t), + sizeof(std::string), + sizeof(int8_t), + sizeof(uint16_t), + sizeof(double), + sizeof(uint32_t), + sizeof(uint64_t), + 0, + 0, + sizeof(uint16_t)}; + + static constexpr std::string_view names[]{ + "Undefine", "Float32", "UInt8", "Int8", "UInt16", + "Int16", "Int32", "Int64", "String", "Bool", + "Float16", "Double", "UInt32", "UInt64", "PlaceHolder", + "PlaceHolder", "BFloat16"}; + + static constexpr int cpuType[]{-1, 0, 2, 3, 4, 5, 6, 7, -1, + 3, 4, 9, 1, 8, -1, -1, 4}; + + private: + int index; + + public: + // FIXME: default ctor should be deleted but json requires it. 
Solution: + // https://github.com/nlohmann/json#how-can-i-use-get-for-non-default-constructiblenon-copyable-types + DataType() = default; + constexpr DataType(int index) : index(index) {} + bool operator==(const DataType &rhs) const { return index == rhs.index; } + bool operator<(const DataType &rhs) const { return index < rhs.index; } + + template static int get() { + IT_TODO_HALT_MSG("Unsupported data type"); + } + size_t getSize() const { return sizePerElement[index]; } + string toString() const { return string(names[index]); } + int cpuTypeInt() const { return cpuType[index]; } + int getIndex() const { return index; } +}; + +// Method definitions are out of the declaration due to GCC bug: +// https://stackoverflow.com/questions/49707184/explicit-specialization-in-non-namespace-scope-does-not-compile-in-gcc +template <> inline int DataType::get() { return 0; } +template <> inline int DataType::get() { return 1; } +template <> inline int DataType::get() { return 2; } +template <> inline int DataType::get() { return 3; } +template <> inline int DataType::get() { return 4; } +template <> inline int DataType::get() { return 5; } +template <> inline int DataType::get() { return 6; } +template <> inline int DataType::get() { return 7; } +template <> inline int DataType::get() { return 8; } +template <> inline int DataType::get() { return 9; } + +template struct DT {}; +template <> struct DT<0> { using t = bool; }; +template <> struct DT<1> { using t = float; }; +template <> struct DT<2> { using t = uint8_t; }; +template <> struct DT<3> { using t = int8_t; }; +template <> struct DT<4> { using t = uint16_t; }; +template <> struct DT<5> { using t = int16_t; }; +template <> struct DT<6> { using t = int32_t; }; +template <> struct DT<7> { using t = int64_t; }; +template <> struct DT<8> { using t = char; }; +template <> struct DT<9> { using t = int8_t; }; +template <> struct DT<10> { using t = uint16_t; }; +template <> struct DT<11> { using t = double; }; +template <> 
struct DT<12> { using t = uint32_t; }; +template <> struct DT<13> { using t = uint64_t; }; +template <> struct DT<16> { using t = uint16_t; }; + +} // namespace infini diff --git a/include/core/graph.h b/include/core/graph.h new file mode 100644 index 0000000..9326eaf --- /dev/null +++ b/include/core/graph.h @@ -0,0 +1,119 @@ +#pragma once +#include "core/allocator.h" +#include "core/operator.h" +#include "core/tensor.h" + +namespace infini +{ + + class GraphObj : public Object + { + protected: + Runtime runtime; + TensorVec tensors; + OpVec ops; + Allocator allocator; + + public: + explicit GraphObj(Runtime runtime) + : runtime(runtime), allocator(runtime), sorted(false){}; + string toString() const override; + Runtime getRuntime() const { return runtime; } + + Tensor addTensor(Shape dim, DataType dtype = DataType::Float32); + Tensor addTensor(const Tensor &tensor); + TensorVec addTensor(const TensorVec &tensors); + void removeOperator(Operator op) + { + auto it = std::find(ops.begin(), ops.end(), op); + if (it != ops.end()) + ops.erase(it); + } + + void removeTensor(Tensor tensor) + { + auto it = std::find(tensors.begin(), tensors.end(), tensor); + if (it != tensors.end()) + tensors.erase(it); + } + + const TensorVec &getTensors() const { return tensors; } + const OpVec &getOperators() const { return ops; } + Tensor getTensor(int) const; + + /** + * @brief Sort the nodes in topological order. + * It returns true if the sorting is successful. + * Otherwise false is returned, means that there are rings in the graph, + * so the topological sorting fails. + */ + bool topo_sort(); + + void optimize(); + + void shape_infer(); + + void dataMalloc(); + + /** + * @brief Add an operator and create its outputs. Output tensor arguments + * should be empty Refs (e.g., nullptr). 
+ */ + template + Ref addOp(Args &&...args) + { + Ref op = infini::make_ref(this, std::forward(args)...); + addOperatorAndConnect(op); + return op; + } + + /** + * @brief Add an operator with its outputs specified. + */ + template + Ref addOpWithOutputs(Args &&...args) + { + Ref op = infini::make_ref(nullptr, std::forward(args)...); + addOperatorAndConnect(op); + return op; + } + + /** + * @brief Gets input tensors of this graph. + */ + inline TensorVec getInputs() const + { + TensorVec ret; + for (const auto &t : tensors) + if (!t->getSource()) + ret.emplace_back(t); + return ret; + } + + /** + * @brief Gets output tensors of this graph. + */ + inline TensorVec getOutputs() const + { + TensorVec ret; + for (const auto &t : tensors) + if (t->getTargets().empty()) + ret.emplace_back(t); + return ret; + } + + bool checkValid() const; + + private: + /** + * @brief Add reverse connections and Op relationship in ctor. + */ + void addOperatorAndConnect(const Operator &op); + + /** + * @brief If the nodes is sorted in topological order. + */ + bool sorted; + }; + +} // namespace infini \ No newline at end of file diff --git a/include/core/kernel.h b/include/core/kernel.h new file mode 100644 index 0000000..a762424 --- /dev/null +++ b/include/core/kernel.h @@ -0,0 +1,87 @@ +#pragma once +#include "core/common.h" +#include "core/operator.h" +#include "core/tensor.h" +#include "utils/operator_utils.h" +#include + +namespace infini +{ + + class RuntimeObj; + + class Kernel + { + public: + Kernel() {} + virtual ~Kernel() {} + + /** + * @brief Executes an op with a default parameter. 
+ */ + virtual void compute(const Operator &op, + const RuntimeObj *context) const = 0; + }; + + class KernelRegistry + { + public: + using KernelRecord = + tuple; // Kernel, name, ID + + private: + std::map kernels; + int nKernels = 0; + + public: + ~KernelRegistry() + { + for (auto &[k, v] : kernels) + delete std::get<0>(v); + } + static KernelRegistry &getInstance() + { + static KernelRegistry instance; + return instance; + } + bool registerKernel(const KernelAttrs &key, Kernel *kernel, string name) + { + IT_ASSERT(kernels.find(key) == kernels.end(), + "Kernel already registered"); + kernels.emplace(key, KernelRecord{kernel, name, ++nKernels}); + return true; + } + Kernel *getKernel(const KernelAttrs &kernelAttrs) const + { + auto it = kernels.find(kernelAttrs); + IT_ASSERT(it != kernels.end(), "Kernel not found for key {" + + get_kernel_attrs_str(kernelAttrs) + + "}"); + return std::get<0>(it->second); + } + const KernelRecord &getKernelItem(const KernelAttrs &kernelAttrs) const + { + return kernels.at(kernelAttrs); + } + }; + + class CpuKernelWithoutConfig : public Kernel + { + public: + virtual void compute(const Operator &op, + const RuntimeObj *context) const = 0; + }; + +} // namespace infini + +#define _REGISTER_KERNEL_1(device, opType, kernel, name, cnt) \ + namespace infini \ + { \ + static const bool _CAT(_register_kernel_, cnt) = \ + KernelRegistry::getInstance().registerKernel(KernelAttrs{device, \ + opType}, \ + new kernel(), name); \ + } + +#define REGISTER_KERNEL(device, opType, kernel, name) \ + _REGISTER_KERNEL_1(device, opType, kernel, name, __COUNTER__) diff --git a/include/core/object.h b/include/core/object.h new file mode 100644 index 0000000..2db50ad --- /dev/null +++ b/include/core/object.h @@ -0,0 +1,71 @@ +#pragma once +#include "core/common.h" +#include "ref.h" + +namespace infini { + +using UidBaseType = int; + +class Uid { + private: + UidBaseType uid; + + public: + Uid(UidBaseType uid) : uid(uid) {} + Uid &operator=(const Uid &rhs) 
= delete; + + operator UidBaseType() const { return uid; } +}; + +class Guid : public Uid { + private: + UidBaseType generateGuid() { + static UidBaseType guidCnt = 0; + return ++guidCnt; + } + + public: + Guid() : Uid(generateGuid()) {} + Guid(const Guid &rhs) : Uid(generateGuid()) {} +}; + +/** + * @brief Family unique ID. Cloned tensors shared the same FUID. + */ +class Fuid : public Uid { + private: + UidBaseType generateFuid() { + static UidBaseType fuidCnt = 0; + return ++fuidCnt; + } + + public: + Fuid() : Uid(generateFuid()) {} + Fuid(const Fuid &fuid) : Uid(fuid) {} +}; + +class Object { + protected: + Guid guid; + + public: + virtual ~Object(){}; + virtual string toString() const = 0; + void print() { std::cout << toString() << std::endl; } + UidBaseType getGuid() const { return guid; } +}; + +inline std::ostream &operator<<(std::ostream &os, const Object &obj) { + os << obj.toString(); + return os; +} + +// Overload for Ref-wrapped Object +template > * = nullptr> +inline std::ostream &operator<<(std::ostream &os, const Ref &obj) { + os << obj->toString(); + return os; +} + +} // namespace infini diff --git a/include/core/op_type.h b/include/core/op_type.h new file mode 100644 index 0000000..db67f33 --- /dev/null +++ b/include/core/op_type.h @@ -0,0 +1,42 @@ +#pragma once +#ifndef OP_TYPE_H +#define OP_TYPE_H + +#include +#include + +namespace infini +{ + struct OpType + { + using underlying_t = uint16_t; + enum : underlying_t + { + Unknown, + Add, + Cast, + Clip, + Concat, + Div, + Mul, + MatMul, + Relu, + Sub, + Transpose, + + } type; + + constexpr OpType(decltype(type) t) : type(t) {} + constexpr explicit OpType(underlying_t val) : type((decltype(type))val) {} + constexpr underlying_t underlying() const { return type; } + + bool operator==(OpType others) const { return type == others.type; } + bool operator!=(OpType others) const { return type != others.type; } + bool operator<(OpType others) const { return type < others.type; } + + const char 
*toString() const; + }; + +} // namespace infini + +#endif // OP_TYPE_H diff --git a/include/core/operator.h b/include/core/operator.h new file mode 100644 index 0000000..0641007 --- /dev/null +++ b/include/core/operator.h @@ -0,0 +1,93 @@ +#pragma once + +#include "core/op_type.h" +#include "core/tensor.h" + +namespace infini +{ + using KernelAttrs = std::tuple; + + class GraphObj; + class OperatorObj : public Object + { + friend class GraphObj; + + protected: + OpType type; + TensorVec inputs; + TensorVec outputs; + vector> predecessors; + vector> successors; + + public: + OperatorObj(OpType opType, TensorVec inputs, TensorVec outputs); + virtual optional> inferShape(const TensorVec &inputs) = 0; + virtual vector inferDataType(const TensorVec &inputs) const; + /** + * @brief Constructs outputs (if required) and checks whether the operator is + * valid. + * + * @param graph If graph is not nullptr, outputs should be created in this + * function. + */ + bool checkValid(GraphObj *graph); + + public: // getter and setter + const TensorVec &getInputs() const { return inputs; } + const TensorVec &getOutputs() const { return outputs; } + Tensor getInputs(size_t i) const { return inputs.at(i); } + Tensor getOutput() const + { + IT_ASSERT(outputs.size() == 1, "Unimplemented"); + return outputs[0]; + } + Tensor getOutput(size_t i) const + { + IT_ASSERT(i < outputs.size(), "Index exceeded"); + return outputs.at(i); + } + OpVec getPredecessors() const { return wrefs_to_refs(predecessors); } + OpVec getSuccessors() const { return wrefs_to_refs(successors); } + OpType getOpType() const { return type; } + // HACK: set correct data type + DataType getDType() const { return getInputs(0)->getDType(); } + DataType getOutDType() const { return getOutput()->getDType(); } + virtual int numInputs() const = 0; + virtual int numOutputs() const = 0; + + /** + * @brief Clone this operator and replace its inputs and outputs.
+ * + * @param newInputs + * @param newOutputs + * @return Operator + */ + virtual Operator clone(const TensorVec &newInputs, + const TensorVec &newOutputs) const = 0; + + protected: + optional> inferShape(); + vector inferDataType() const; + + private: + void addPredecessors(const Operator &op) { predecessors.emplace_back(op); } + void addSuccessors(const Operator &op) { successors.emplace_back(op); } + void removePredecessors(const Operator &op); + void removeSuccessors(const Operator &op); + void replaceInput(Tensor t1, Tensor t2); + }; + +#define OP_CLONE(OpObj) \ + virtual Operator clone(const TensorVec &newInputs, \ + const TensorVec &newOutputs) const override \ + { \ + auto op = infini::make_ref(*this); \ + op->inputs = newInputs; \ + op->outputs = newOutputs; \ + op->predecessors.clear(); \ + op->successors.clear(); \ + IT_ASSERT(op->checkValid(nullptr)); \ + return op; \ + } + +} // namespace infini diff --git a/include/core/ref.h b/include/core/ref.h new file mode 100644 index 0000000..3393f6e --- /dev/null +++ b/include/core/ref.h @@ -0,0 +1,43 @@ +#pragma once +#include "core/common.h" +#include +#include +#include + +namespace infini { + +template using Ref = std::shared_ptr; +template using WRef = std::weak_ptr; + +template struct is_ref : std::false_type {}; +template struct is_ref> : std::true_type {}; +template struct is_ref> : std::true_type {}; + +template Ref make_ref(Params &&...params) { + static_assert(is_ref::value == false, "Ref should not be nested"); + return std::make_shared(std::forward(params)...); +} + +template > * = nullptr> +Ref as(const Ref &ref) { + return std::dynamic_pointer_cast(ref); +} + +template +std::vector> refs_to_wrefs(const std::vector> &refs) { + std::vector> wrefs; + for (const auto &ref : refs) + wrefs.emplace_back(ref); + return wrefs; +} + +template +std::vector> wrefs_to_refs(const std::vector> &wrefs) { + std::vector> refs; + for (const auto &wref : wrefs) + refs.emplace_back(wref); + return refs; +} + +} // 
namespace infini diff --git a/include/core/runtime.h b/include/core/runtime.h new file mode 100644 index 0000000..1b64cd9 --- /dev/null +++ b/include/core/runtime.h @@ -0,0 +1,69 @@ +#pragma once +#include "core/common.h" +#include "core/op_type.h" +#include "core/ref.h" + +namespace infini +{ + class TensorObj; + class OperatorObj; + class GraphObj; + class RuntimeObj; + class BlobObj; + + using Tensor = Ref; + using Operator = Ref; + using Graph = Ref; + using Runtime = Ref; + using Blob = Ref; + + using TensorVec = vector; + using OpVec = vector; + + enum class Device + { + CPU = 1 + }; + + class RuntimeObj : public std::enable_shared_from_this + { + protected: + Device device; + + public: + explicit RuntimeObj(Device device) + : device(device) {} + RuntimeObj(RuntimeObj &other) = delete; + RuntimeObj &operator=(RuntimeObj const &) = delete; + virtual ~RuntimeObj() {} + + virtual void run(const Graph &graph) const = 0; + virtual void *alloc(size_t size) = 0; + virtual void dealloc(void *ptr) = 0; + + bool isCpu() const + { + return true; + } + + virtual string toString() const = 0; + }; + + class NativeCpuRuntimeObj : public RuntimeObj + { + public: + NativeCpuRuntimeObj() : RuntimeObj(Device::CPU) {} + + static Ref &getInstance() + { + static Ref instance = + make_ref(); + return instance; + } + void dealloc(void *ptr) override; + void run(const Graph &graph) const override; + void *alloc(size_t size) override; + string toString() const override; + }; + +} // namespace infini diff --git a/include/core/tensor.h b/include/core/tensor.h new file mode 100644 index 0000000..93eec14 --- /dev/null +++ b/include/core/tensor.h @@ -0,0 +1,164 @@ +#pragma once +#include "core/blob.h" +#include "core/data_type.h" +#include "core/object.h" +#include "core/runtime.h" +#include +#include +#include + +namespace infini +{ + class GraphObj; + using ShapeElem = int; + using Shape = vector; + class TensorObj : public Object + { + friend class GraphObj; + + protected: + int dim; + 
+ DataType dtype; + vector> targets; + WRef source; + Blob data; + Runtime runtime; + + private: + Shape shape; + size_t _size; // Cache of Π(shape). + Fuid fuid; // Cloned tensors share the same id. Tensors constructed from + // scratch have a new id. + + public: + TensorObj(Shape shape, DataType dtype, Runtime runtime); + virtual ~TensorObj() {} + string toString() const override; + + size_t size() const { return _size; } + size_t getBytes() const { return _size * dtype.getSize(); } + + Shape getDims() const { return shape; } + void setShape(Shape shape_); + size_t getRank() const { return shape.size(); } + UidBaseType getFuid() const { return fuid; } + + void setData( + std::function const &generator) const; + + void setDataBlob(const Blob &blob); + + void printData() const; + bool equalData(const Tensor &rhs, double relativeError = 1e-6) const; + + template + bool equalData(const vector &dataVector) + { + IT_ASSERT(size() == dataVector.size()); + IT_ASSERT(DataType::get() == dtype.cpuTypeInt()); + return equalDataImpl(getRawDataPtr(), dataVector.data(), size()); + } + + template + T getRawDataPtr() const + { + static_assert(std::is_pointer_v, + "Raw data pointer has a type of pointer"); + IT_ASSERT(data != nullptr); + return data->getPtr(); + } + + DataType getDType() const { return dtype; } + Runtime getRuntime() const { return runtime; } + + OpVec getTargets() const { return wrefs_to_refs(targets); } + Operator getSource() const { return source.lock(); } + + private: + template + string dataToString() const + { + std::stringstream builder; + builder << "Tensor: " << guid << std::endl; + + auto numDims = shape.size(); + auto dimSzVec = vector(numDims, 1); + auto ptr = data->getPtr(); + dimSzVec[numDims - 1] = shape[numDims - 1]; + + for (int i = numDims - 1; i != 0; --i) + dimSzVec[i - 1] = dimSzVec[i] * shape[i - 1]; + + for (size_t i = 0, iEnd = size(); i < iEnd; ++i) + { + for (size_t j = 0; j < numDims; ++j) + if (i % dimSzVec[j] == 0) + builder << "["; + 
+ builder << ptr[i]; + for (size_t j = 0; j < numDims; ++j) + if ((int)i % dimSzVec[j] == dimSzVec[j] - 1) + builder << "]"; + + if (i != size() - 1) + builder << ", "; + + auto column = (size_t)dimSzVec[numDims - 1]; + if (i % column == column - 1) + builder << std::endl; + } + return builder.str(); + } + + template + bool equalDataImpl(const T *a, const T *b, size_t size, + double relativeError = 1e-6) const + { + for (size_t i = 0; i < size; ++i) + { + if constexpr (std::is_integral_v) + { + if (a[i] != b[i]) + return false; + } + else if constexpr (std::is_floating_point_v) + { + if (std::min(fabs(a[i]), fabs(b[i])) == 0. && + fabs(a[i] - b[i]) > relativeError) + { + printf("Error on %lu: %f %f\n", i, a[i], b[i]); + return false; + } + else if (std::min(fabs(a[i]), fabs(b[i])) != 0. && + fabs(a[i] - b[i]) / + std::max(fabs(a[i]), fabs(b[i])) > + relativeError) + { + printf("Error on %lu: %f %f\n", i, a[i], b[i]); + return false; + } + } + else + { + static_assert(!sizeof(T), "Unsupported data type"); + } + } + return true; + } + + void addTarget(const Operator &op) { targets.emplace_back(op); } + void setSource(const Operator &op) { source = op; } + void removeTarget(const Operator &op) + { + for (auto itr = targets.begin(); itr != targets.end();) + { + if (itr->lock() == op) + itr = targets.erase(itr); + else + ++itr; + } + } + }; + +} // namespace infini diff --git a/include/operators/concat.h b/include/operators/concat.h new file mode 100644 index 0000000..86287fd --- /dev/null +++ b/include/operators/concat.h @@ -0,0 +1,32 @@ +#pragma once +#include "core/operator.h" + +namespace infini { +/** + * @brief Concatenate several tensors into one. All the input tensors should + * have the same shape except for the concatenated dimension. + * + */ +class ConcatObj : public OperatorObj { + int dim; + + public: + /** + * @brief Construct a new Concat object. + * + * @param graph The computation graph that this operator belongs to. 
+ * @param inputs The input tensors to be concatenated. + * @param output Concatenated tensor. + * @param dim The dimension to concatenate on. + */ + ConcatObj(GraphObj *graph, TensorVec inputs, Tensor output, int dim); + OP_CLONE(ConcatObj); + + optional> inferShape(const TensorVec &inputs) override; + + std::string toString() const override; + int numInputs() const override { return inputs.size(); } + int numOutputs() const override { return 1; } + int getDim() const { return dim; } +}; +} // namespace infini diff --git a/include/operators/element_wise.h b/include/operators/element_wise.h new file mode 100644 index 0000000..4260b2d --- /dev/null +++ b/include/operators/element_wise.h @@ -0,0 +1,47 @@ +#pragma once +#include "core/operator.h" + +namespace infini +{ + /** + * @brief Base class of **binary** element-wise operators. + * Unary operators like activations are not the derived classes of + * ElementWiseObj. + * + */ + class ElementWiseObj : public OperatorObj + { + public: + /** + * @brief Construct a new ElementWise object + * + * @param type Operator type. + * @param graph The computation graph that this operator belongs to. + * @param input0 The first input tensor. + * @param input1 The second input tensor. + * @param output The output tensor. 
+ */ + ElementWiseObj(OpType type, GraphObj *graph, Tensor input0, Tensor input1, + Tensor output); + optional> inferShape(const TensorVec &inputs) override; + + std::string toString() const override; + int numInputs() const override { return 2; } + int numOutputs() const override { return 1; } + }; + +#define DEFINE_ELEMENT_WISE_OBJ(prefix, type) \ + class prefix##Obj : public ElementWiseObj \ + { \ + public: \ + prefix##Obj(GraphObj *graph, Tensor input0, Tensor input1, \ + Tensor output) \ + : ElementWiseObj(type, graph, input0, input1, output) {} \ + OP_CLONE(prefix##Obj); \ + }; + + DEFINE_ELEMENT_WISE_OBJ(Add, OpType::Add) + DEFINE_ELEMENT_WISE_OBJ(Sub, OpType::Sub) + DEFINE_ELEMENT_WISE_OBJ(Mul, OpType::Mul) + DEFINE_ELEMENT_WISE_OBJ(Div, OpType::Div) +}; // namespace infini diff --git a/include/operators/matmul.h b/include/operators/matmul.h new file mode 100644 index 0000000..4925895 --- /dev/null +++ b/include/operators/matmul.h @@ -0,0 +1,60 @@ +#pragma once +#include "core/operator.h" + +namespace infini +{ + /** + * @brief Matrix multiplication. + * + */ + class MatmulObj : public OperatorObj + { + private: + // InfiniTensor assumes a row-major tensor layout. `transA`=false means + // default dims, true means A should be transposed before matmul. This is the + // opposite of the column-major BLAS. + bool transA, transB; + + // Auxiliary attributes which are not a part of operator attributes. + int m, n, k; + + public: + /** + * @brief Matmul operator with batch broadcast and tensor transpose + * supports. Only one tensor with single batch can be broadcasted due to the + * BLAS interface restriction. Transpose indicates whether the last two + * dimensions should be transposed before Matmul and does not affect other + * leading dimensions. + * + * Matmul shows how operators are defined in InfiniTensor. The constructor of + * an operator can create output tensors for the operator or not, which + * depends on `graph`.
+ * + * @param graph The computation graph that this operator belongs to. + * @param A The input tensor. + * @param B The input tensor. + * @param C C is the output of Matmul. If outputs are going to be created in + * the constructor, C should be an empty Ref. + * @param transA If matrix A should be transposed when computing. + * @param transB If matrix B should be transposed when computing. + */ + MatmulObj(GraphObj *graph, Tensor A, Tensor B, Tensor C, + bool transA = false, bool transB = false); + OP_CLONE(MatmulObj); + + std::string toString() const override; + optional> inferShape(const TensorVec &inputs) override; + + int numInputs() const override { return inputs.size(); } + int numOutputs() const override { return 1; } + + bool getTransA() const { return transA; } + bool getTransB() const { return transB; } + void setTransA(bool transA) { this->transA = transA; } + void setTransB(bool transB) { this->transB = transB; } + int getM() const { return m; } + int getN() const { return n; } + int getK() const { return k; } + }; + +} // namespace infini \ No newline at end of file diff --git a/include/operators/transpose.h b/include/operators/transpose.h new file mode 100644 index 0000000..c32bbe5 --- /dev/null +++ b/include/operators/transpose.h @@ -0,0 +1,34 @@ +#pragma once +#include "core/operator.h" + +namespace infini +{ + /** + * @brief Transpose the input tensor similar to numpy.transpose. + * + */ + class TransposeObj : public OperatorObj + { + public: + /** + * @brief Construct a new TransposeObj object. + * + * @param graph The graph to which this operator belongs. + * @param input The input tensor. + * @param output The output tensor. + * @param permute The permutation of the dimensions. 
+ */ + TransposeObj(GraphObj *graph, Tensor input, Tensor output, + vector permute); + OP_CLONE(TransposeObj); + optional> inferShape(const TensorVec &inputs) override; + + std::string toString() const override; + int numInputs() const override { return 1; } + int numOutputs() const override { return 1; } + std::vector getPermute() const { return transposePermute; } + + private: + vector transposePermute; + }; +} // namespace infini diff --git a/include/operators/unary.h b/include/operators/unary.h new file mode 100644 index 0000000..83f3dd3 --- /dev/null +++ b/include/operators/unary.h @@ -0,0 +1,104 @@ +#pragma once +#include "core/operator.h" + +namespace infini +{ + /** + * @brief The base class for unary operators. + * + */ + class UnaryObj : public OperatorObj + { + public: + /** + * @brief Construct a new Unary object. + * + * @param type Operator type. + * @param graph The computation graph that this operator belongs to. + * @param input The input tensor. + * @param output The output tensor. 
+ */ + UnaryObj(OpType type, GraphObj *graph, Tensor input, Tensor output); + optional> inferShape(const TensorVec &inputs) override; + + std::string toString() const override; + int numInputs() const override { return 1; } + int numOutputs() const override { return 1; } + }; + + class ClipObj : public OperatorObj + { + public: + ClipObj(GraphObj *graph, Tensor input, Tensor output, + std::optional min, std::optional max); + OP_CLONE(ClipObj); + optional> inferShape(const TensorVec &inputs) override; + + std::string toString() const override; + std::optional getMin() const { return minValue; }; + std::optional getMax() const { return maxValue; }; + int numInputs() const override { return 1; } + int numOutputs() const override { return 1; } + + private: + std::optional minValue, maxValue; + }; + + enum class CastType + { + Float2Float16 = 0, + Float2Int64, + Float2Int32, + Float2Int16, + Float2Int8, + Float2BFloat16, + Int322Float, + Int322Int8, + Int322Int16, + Int322Int64, + Int162Float, + Int162Int32, + Int82Float, + Int82Int16, + Int82Int32, + Uint82Float, + Uint82Int32, + Uint82Int64, + Int642Int32, + Int642Uint32, + Int642Float, + Uint322Int64, + Float162Float, + BFloat162Float, + Float2Float, + }; + + class CastObj : public OperatorObj + { + public: + CastObj(GraphObj *graph, Tensor input, Tensor output, CastType type); + OP_CLONE(CastObj); + optional> inferShape(const TensorVec &inputs) override; + vector inferDataType(const TensorVec &inputs) const override; + + std::string toString() const override; + CastType getType() const { return castType; } + DataType getOutputDataType() const; + int numInputs() const override { return 1; } + int numOutputs() const override { return 1; } + + private: + CastType castType; + }; + +#define DEFINE_UNARY_OBJ(prefix, type) \ + class prefix##Obj : public UnaryObj \ + { \ + public: \ + prefix##Obj(GraphObj *graph, Tensor input, Tensor output) \ + : UnaryObj(type, graph, input, output) {} \ + OP_CLONE(prefix##Obj); \ + }; + + 
DEFINE_UNARY_OBJ(Relu, OpType::Relu) +}; // namespace infini diff --git a/include/test.h b/include/test.h new file mode 100644 index 0000000..cef5a3e --- /dev/null +++ b/include/test.h @@ -0,0 +1,4 @@ +#pragma once +#include "core/common.h" +#include "utils/data_generator.h" +#include "gtest/gtest.h" diff --git a/include/utils/data_generator.h b/include/utils/data_generator.h new file mode 100644 index 0000000..1b7d91a --- /dev/null +++ b/include/utils/data_generator.h @@ -0,0 +1,59 @@ +#pragma once +#include "core/common.h" +#include + +namespace infini { + +class DataGenerator { + private: + virtual void fill(uint32_t *data, size_t size) { IT_TODO_HALT(); } + virtual void fill(float *data, size_t size) { IT_TODO_HALT(); } + +public: + virtual ~DataGenerator() {} + void operator()(void *data, size_t size, DataType dataType) { + if (dataType == DataType::UInt32) + fill(reinterpret_cast(data), size); + else if (dataType == DataType::Float32) + fill(reinterpret_cast(data), size); + else + IT_TODO_HALT(); + } +}; + +class IncrementalGenerator : public DataGenerator { + public: + virtual ~IncrementalGenerator() {} + + private: + template void fill(T *data, size_t size) { + for (size_t i = 0; i < size; i++) { + data[i] = i; + } + } + + void fill(uint32_t *data, size_t size) override { + fill(data, size); + } + void fill(float *data, size_t size) override { fill(data, size); } +}; + +template class ValGenerator : public DataGenerator { + public: + virtual ~ValGenerator() {} + + private: + template void fill(T *data, size_t size) { + for (size_t i = 0; i < size; i++) { + data[i] = val; + } + } + + void fill(uint32_t *data, size_t size) override { + fill(data, size); + } + void fill(float *data, size_t size) override { fill(data, size); } +}; +typedef ValGenerator<1> OneGenerator; +typedef ValGenerator<0> ZeroGenerator; +} // namespace infini diff --git a/include/utils/exception.h b/include/utils/exception.h new file mode 100644 index 0000000..d7bb433 --- /dev/null +++ 
b/include/utils/exception.h @@ -0,0 +1,22 @@ +#pragma once +#include +#include + +namespace infini { + +class Exception : public std::runtime_error { + protected: + std::string info; + + public: + Exception(const std::string &msg); + + Exception &operator<<(const std::string &str) { + info += str; + return *this; + } + + const char *what() const noexcept override { return info.c_str(); } +}; + +} // namespace infini diff --git a/include/utils/operator_utils.h b/include/utils/operator_utils.h new file mode 100644 index 0000000..e3a2373 --- /dev/null +++ b/include/utils/operator_utils.h @@ -0,0 +1,26 @@ +#pragma once +#ifndef OPERATOR_UTIL_H +#define OPERATOR_UTIL_H + +#include "core/operator.h" +#include "core/tensor.h" + +#include + +namespace infini { + +// Launch a broadcast shape based on the shape of input A and B +Shape infer_broadcast(const Shape &A, const Shape &B); +// Launch the real axis based on rank and current axis +int get_real_axis(const int &axis, const int &rank); +// Locate the index with size from Shape +Shape locate_index(size_t inputN, const Shape &shape); +// Delocate the ShapeIndex from Shape with broadcast +size_t delocate_index(const Shape &shapeIndex, const Shape &shape, + const Shape &stride); +// Convert KernelAttrs to a string representation +std::string get_kernel_attrs_str(const KernelAttrs &kernelAttrs); + +} // namespace infini + +#endif diff --git a/src/core/allocator.cc b/src/core/allocator.cc new file mode 100644 index 0000000..ff593ae --- /dev/null +++ b/src/core/allocator.cc @@ -0,0 +1,69 @@ +#include "core/allocator.h" +#include + +namespace infini +{ + Allocator::Allocator(Runtime runtime) : runtime(runtime) + { + used = 0; + peak = 0; + ptr = nullptr; + + // 'alignment' defaults to sizeof(uint64_t), because it is the length of + // the longest data type currently supported by the DataType field of + // the tensor + alignment = sizeof(uint64_t); + } + + Allocator::~Allocator() + { + if (this->ptr != nullptr) + { + 
runtime->dealloc(this->ptr); + } + } + + size_t Allocator::alloc(size_t size) + { + IT_ASSERT(this->ptr == nullptr); + // pad the size to the multiple of alignment + size = this->getAlignedSize(size); + + // =================================== 作业 =================================== + // TODO: 设计一个算法来分配内存,返回起始地址偏移量 + // =================================== 作业 =================================== + + return 0; + } + + void Allocator::free(size_t addr, size_t size) + { + IT_ASSERT(this->ptr == nullptr); + size = getAlignedSize(size); + + // =================================== 作业 =================================== + // TODO: 设计一个算法来回收内存 + // =================================== 作业 =================================== + } + + void *Allocator::getPtr() + { + if (this->ptr == nullptr) + { + this->ptr = runtime->alloc(this->peak); + printf("Allocator really alloc: %p %lu bytes\n", this->ptr, peak); + } + return this->ptr; + } + + size_t Allocator::getAlignedSize(size_t size) + { + return ((size - 1) / this->alignment + 1) * this->alignment; + } + + void Allocator::info() + { + std::cout << "Used memory: " << this->used + << ", peak memory: " << this->peak << std::endl; + } +} diff --git a/src/core/data_type.cc b/src/core/data_type.cc new file mode 100644 index 0000000..3825c9c --- /dev/null +++ b/src/core/data_type.cc @@ -0,0 +1,22 @@ +#include "core/data_type.h" + +namespace infini { +// Move implementation here to avoid compile time error on some platform +// to be consistent with onnx +// https://github.com/onnx/onnx/blob/aeb21329122b96df1d3ef33b500a35ca140b1431/onnx/onnx.proto#L484 +const DataType DataType::Undefine(0); +const DataType DataType::Float32(1); +const DataType DataType::UInt8(2); +const DataType DataType::Int8(3); +const DataType DataType::UInt16(4); +const DataType DataType::Int16(5); +const DataType DataType::Int32(6); +const DataType DataType::Int64(7); +const DataType DataType::String(8); +const DataType DataType::Bool(9); +const DataType 
DataType::Float16(10); +const DataType DataType::Double(11); +const DataType DataType::UInt32(12); +const DataType DataType::UInt64(13); +const DataType DataType::BFloat16(16); +} // namespace infini diff --git a/src/core/graph.cc b/src/core/graph.cc new file mode 100644 index 0000000..3a90637 --- /dev/null +++ b/src/core/graph.cc @@ -0,0 +1,230 @@ +#include "core/graph.h" +#include +#include +#include + +namespace infini +{ + + void GraphObj::addOperatorAndConnect(const Operator &op) + { + sorted = false; + ops.push_back(op); + for (auto &input : op->getInputs()) + { + if (input) + { + input->addTarget(op); + if (auto pred = input->getSource()) + { + pred->addSuccessors(op); + op->addPredecessors(pred); + } + } + } + for (auto &output : op->getOutputs()) + { + if (output) + { + output->setSource(op); + for (auto &succ : output->getTargets()) + { + succ->addPredecessors(op); + op->addSuccessors(succ); + } + } + } + } + + string GraphObj::toString() const + { + std::ostringstream oss; + oss << "Graph Tensors:\n"; + for (const auto &tensor : tensors) + oss << tensor << "\n"; + + oss << "Graph operators:\n"; + for (const auto &op : ops) + { + vector preds, succs; + for (auto &o : op->getPredecessors()) + preds.emplace_back(o->getGuid()); + for (auto &o : op->getSuccessors()) + succs.emplace_back(o->getGuid()); + oss << "OP " << op->getGuid(); + oss << ", pred " << vecToString(preds); + oss << ", succ " << vecToString(succs); + oss << ", " << op << "\n"; + } + return oss.str(); + } + + bool GraphObj::topo_sort() + { + if (this->sorted) + { + return true; + } + std::vector sorted; + std::unordered_set flags; + sorted.reserve(ops.size()); + flags.reserve(ops.size()); + while (sorted.size() < ops.size()) + { + // At least one node is moved to `sorted` in each pass; otherwise there is a cycle.
+ auto modified = false; + for (auto const &op : ops) + { + if (auto const &inputs = op->getInputs(); + flags.find(op.get()) == flags.end() && + std::all_of(inputs.begin(), inputs.end(), + [&flags](auto const &input) + { + auto ptr = input->getSource().get(); + return !ptr || flags.find(ptr) != flags.end(); + })) + { + modified = true; + sorted.emplace_back(op); + flags.insert(op.get()); + } + } + if (!modified) + { + return false; + } + } + this->ops = std::move(sorted); + return this->sorted = true; + } + + void GraphObj::optimize() + { + // =================================== 作业 =================================== + // TODO: 设计一个算法来实现指定的图优化规则 + // 图优化规则如下: + // 1. 去除冗余的算子(例如,两个相邻的算子都是 transpose 算子,且做的是相反的操作,可以将其全部删除) + // 2. 合并算子(例如,矩阵乘算子中含有属性transA、transB,如果其输入存在transpose,且对最后两个维度做交换,就可以将transpose融入到矩阵乘算子的属性中去) + // =================================== 作业 =================================== + } + + Tensor GraphObj::getTensor(int fuid) const + { + for (auto tensor : tensors) + { + if (tensor->getFuid() == fuid) + { + return tensor; + } + } + return nullptr; + } + + void GraphObj::shape_infer() + { + for (auto &op : ops) + { + auto ans = op->inferShape(); + IT_ASSERT(ans.has_value()); + auto oldOutputs = op->getOutputs(); + IT_ASSERT(ans.value().size() == oldOutputs.size()); + // replace the old outputshape and size with new one + for (int i = 0; i < (int)ans.value().size(); ++i) + { + auto newShape = ans.value()[i]; + auto oldShape = oldOutputs[i]->getDims(); + auto fuid = oldOutputs[i]->getFuid(); + if (newShape != oldShape) + { + auto tensor = this->getTensor(fuid); + tensor->setShape(newShape); + } + } + } + } + + void GraphObj::dataMalloc() + { + // topological sorting first + IT_ASSERT(topo_sort() == true); + + // =================================== 作业 =================================== + // TODO:利用 allocator 给计算图分配内存 + // HINT: 获取分配好的内存指针后,可以调用 tensor 的 setDataBlob 函数给 tensor 绑定内存 + // =================================== 作业 
=================================== + + allocator.info(); + } + + Tensor GraphObj::addTensor(Shape dim, DataType dtype) + { + return tensors.emplace_back(make_ref(dim, dtype, runtime)); + } + + Tensor GraphObj::addTensor(const Tensor &tensor) + { + IT_ASSERT(tensor->getRuntime() == runtime, + std::string("Tensor runtime mismatch: cannot add a tenosr in ") + + tensor->getRuntime()->toString() + " to " + + runtime->toString()); + tensors.emplace_back(tensor); + return tensor; + } + + TensorVec GraphObj::addTensor(const TensorVec &tensors) + { + for (auto &t : tensors) + addTensor(t); + return tensors; + } + + // tensor's "source" and "target" must be in "ops". + // tensor has no "source" and no "target" must not exist. + // "inputs" or "outputs" of operators must be in "tensors" + // "predecessors" and "successors" of an operator of "ops" must be in "ops". + bool GraphObj::checkValid() const + { + for (auto tensor : tensors) + { + IT_ASSERT(!(tensor->getTargets().size() == 0 && + nullptr == tensor->getSource())); + for (auto op : tensor->getTargets()) + { + IT_ASSERT(std::find(ops.begin(), ops.end(), op) != ops.end()); + } + auto op = tensor->getSource(); + IT_ASSERT(!(op && std::find(ops.begin(), ops.end(), op) == ops.end())); + } + for (auto op : ops) + { + for (auto tensor : op->getInputs()) + { + IT_ASSERT(std::find(tensors.begin(), tensors.end(), tensor) != + tensors.end()); + } + for (auto tensor : op->getOutputs()) + { + IT_ASSERT(std::find(tensors.begin(), tensors.end(), tensor) != + tensors.end()); + } + for (auto pre : op->getPredecessors()) + { + IT_ASSERT(std::find(ops.begin(), ops.end(), pre) != ops.end()); + } + for (auto suc : op->getSuccessors()) + { + IT_ASSERT(std::find(ops.begin(), ops.end(), suc) != ops.end()); + } + } + std::set s; + // check whether two tensors with the same FUID exist + for (auto tensor : tensors) + { + int cnt = s.count(tensor->getFuid()); + IT_ASSERT(cnt == 0, std::to_string(tensor->getFuid())); + s.insert(tensor->getFuid()); 
+ } + return true; + } + +} // namespace infini \ No newline at end of file diff --git a/src/core/op_type.cc b/src/core/op_type.cc new file mode 100644 index 0000000..b2a721a --- /dev/null +++ b/src/core/op_type.cc @@ -0,0 +1,32 @@ +#include "core/op_type.h" + +namespace infini +{ + const char *OpType::toString() const + { +#define CASE(NAME) \ + case OpType::NAME: \ + return #NAME + + switch (type) + { + CASE(Unknown); + CASE(Add); + CASE(Sub); + CASE(Mul); + CASE(Div); + CASE(Cast); + CASE(Clip); + CASE(Relu); + CASE(Transpose); + CASE(Concat); + CASE(MatMul); + + default: + return "Unknown"; + } + +#undef CASE + } + +} // namespace infini diff --git a/src/core/operator.cc b/src/core/operator.cc new file mode 100644 index 0000000..a70ca48 --- /dev/null +++ b/src/core/operator.cc @@ -0,0 +1,85 @@ +#include "core/operator.h" +#include "core/graph.h" + +namespace infini +{ + + OperatorObj::OperatorObj(OpType opType, TensorVec inputs, TensorVec outputs) + : type(opType), inputs(inputs), outputs(outputs) {} + + void OperatorObj::removePredecessors(const Operator &op) + { + for (auto it = predecessors.begin(); it != predecessors.end();) + { + if (it->lock() == op) + it = predecessors.erase(it); + else + ++it; + } + } + + void OperatorObj::removeSuccessors(const Operator &op) + { + for (auto it = successors.begin(); it != successors.end();) + { + if (it->lock() == op) + it = successors.erase(it); + else + ++it; + } + } + + void OperatorObj::replaceInput(Tensor t1, Tensor t2) + { + for (auto itr = inputs.begin(); itr != inputs.end(); ++itr) + { + if (*itr == t1) + { + *itr = t2; + } + } + } + + bool OperatorObj::checkValid(GraphObj *graph) + { + auto optShapes = inferShape(); + if (!optShapes) // shape inference failed + return false; + + const vector &shapes = *optShapes; + if (shapes.size() != outputs.size()) + return false; + if (graph) + { // if graph != nullptr, outputs should be created + auto dataTypes = inferDataType(); + for (size_t i = 0; i < outputs.size(); 
i++) + { + IT_ASSERT(!outputs[i], "Find empty output while operator creation"); + outputs[i] = graph->addTensor(shapes[i], dataTypes[i]); + } + } + else + { // if outputs have been created, check their shapes + for (size_t i = 0; i < shapes.size(); ++i) + { + if (shapes[i] != outputs[i]->getDims()) + return false; + } + } + return true; + } + + optional> OperatorObj::inferShape() { return inferShape(inputs); } + + vector OperatorObj::inferDataType(const TensorVec &inputs) const + { + auto dataType = inputs[0]->getDType(); + return vector(numOutputs(), dataType); + } + + vector OperatorObj::inferDataType() const + { + return inferDataType(inputs); + } + +} // namespace infini diff --git a/src/core/runtime.cc b/src/core/runtime.cc new file mode 100644 index 0000000..bd88d90 --- /dev/null +++ b/src/core/runtime.cc @@ -0,0 +1,36 @@ +#include "core/runtime.h" +#include "core/blob.h" +#include "core/kernel.h" +#include "core/graph.h" +#include "core/kernel.h" +#include +#include +#include +namespace infini +{ + void NativeCpuRuntimeObj::run(const Graph &graph) const + { + const auto &kernelRegistry = KernelRegistry::getInstance(); + + for (auto &op : graph->getOperators()) + { + auto kernelAttrs = KernelAttrs{device, op->getOpType().underlying()}; + Kernel *kernel = kernelRegistry.getKernel(kernelAttrs); + kernel->compute(op, this); + } + } + + string NativeCpuRuntimeObj::toString() const { return "CPU Runtime"; } + + void NativeCpuRuntimeObj::dealloc(void *ptr) + { + return free(ptr); + } + + void *NativeCpuRuntimeObj::alloc(size_t size) + { + return calloc((size + sizeof(uint64_t) - 1) / sizeof(uint64_t), + sizeof(uint64_t)); + } + +} // namespace infini diff --git a/src/core/tensor.cc b/src/core/tensor.cc new file mode 100644 index 0000000..db54a2d --- /dev/null +++ b/src/core/tensor.cc @@ -0,0 +1,116 @@ +#include "core/tensor.h" +#include "core/blob.h" +#include "core/operator.h" +#include "core/runtime.h" +#include +#include + +namespace infini { + + 
TensorObj::TensorObj(Shape shape_, DataType dtype, Runtime runtime) + : dim(shape_.size()), dtype(dtype), runtime(runtime), shape(std::move(shape_)), + _size(std::accumulate(shape.begin(), shape.end(), 1, std::multiplies{})) {} + + string TensorObj::toString() const + { + // Convert data pointer to string + std::stringstream ss; + if (data != nullptr) + ss << data->getPtr(); + else + ss << "nullptr data"; + string ret = "Tensor " + std::to_string(guid) + ", Fuid " + + std::to_string(fuid) + ", shape " + vecToString(shape) + + ", dtype " + dtype.toString() + ", " + runtime->toString() + + ", " + ss.str() + "\n"; + vector targetGuids; + for (const auto &op : targets) + targetGuids.emplace_back(op.lock()->getGuid()); + if (auto o = source.lock()) + ret += ", source " + std::to_string(o->getGuid()); + else + ret += ", source None"; + ret += ", targets " + vecToString(targetGuids); + return ret; + } + +void TensorObj::setShape(Shape shape_) { + shape = shape_; + size_t size = std::accumulate(shape.begin(), shape.end(), 1, + [](auto acc, auto x) { return acc * x; }); + _size = size; +} + +void TensorObj::printData() const { + IT_ASSERT(data != nullptr); + if (!runtime->isCpu()) + IT_TODO_HALT(); + +#define TRY_PRINT(N) \ + if (dtype == DataType(N)) \ + std::cout << dataToString::t>() << std::endl; + + TRY_PRINT(0) // fmt: new line + else TRY_PRINT(1) // + else TRY_PRINT(2) // + else TRY_PRINT(3) // + else TRY_PRINT(4) // + else TRY_PRINT(5) // + else TRY_PRINT(6) // + else TRY_PRINT(7) // + else TRY_PRINT(8) // + else TRY_PRINT(9) // + else TRY_PRINT(10) // + else TRY_PRINT(11) // + else TRY_PRINT(12) // + else TRY_PRINT(13) // + else TRY_PRINT(16) // + else IT_TODO_HALT(); + +#undef TRY_PRINT +} + +bool TensorObj::equalData(const Tensor &rhs, double relativeError) const { + IT_ASSERT(data != nullptr); + IT_ASSERT(rhs->data != nullptr); + IT_ASSERT(getDType() == rhs->getDType()); + IT_ASSERT(runtime->isCpu()); + IT_ASSERT(rhs->getRuntime()->isCpu()); + if (size() != 
rhs->size()) + return false; + +#define TEST_EQUAL(N) \ + if (dtype == DataType(N)) \ + return equalDataImpl(getRawDataPtr::t *>(), \ + rhs->getRawDataPtr::t *>(), size(), \ + relativeError); + + TEST_EQUAL(0) // fmt: new line + else TEST_EQUAL(1) // + else TEST_EQUAL(2) // + else TEST_EQUAL(3) // + else TEST_EQUAL(4) // + else TEST_EQUAL(5) // + else TEST_EQUAL(6) // + else TEST_EQUAL(7) // + else TEST_EQUAL(8) // + else TEST_EQUAL(9) // + else TEST_EQUAL(10) // + else TEST_EQUAL(11) // + else TEST_EQUAL(12) // + else TEST_EQUAL(13) // + else TEST_EQUAL(16) // + else IT_TODO_HALT(); + +#undef TEST_EQUAL +} + +void TensorObj::setData( + const std::function &generator) const { + IT_ASSERT(data != nullptr); + generator(getRawDataPtr(), size(), dtype); +} + +void TensorObj::setDataBlob(const Blob &blob) { this->data = blob; } + +}; // namespace infini diff --git a/src/kernels/cpu/concat.cc b/src/kernels/cpu/concat.cc new file mode 100644 index 0000000..6e061c7 --- /dev/null +++ b/src/kernels/cpu/concat.cc @@ -0,0 +1,64 @@ +#include "operators/concat.h" +#include "core/kernel.h" + +namespace infini { + +class NaiveConcat : public CpuKernelWithoutConfig { + template + void doCompute(const Operator &_op, const RuntimeObj *context) const { + auto op = as(_op); + auto inputs = op->getInputs(), outputs = op->getOutputs(); + auto dim = op->getDim(); + auto output = outputs[0]; + std::vector iDims; + for (auto input : inputs) + iDims.emplace_back(input->getDims()); + const auto &outDim = output->getDims(); + size_t blockOffsetInner = 1; + for (size_t i = outDim.size() - 1; i > (size_t)dim; --i) + blockOffsetInner *= outDim[i]; + size_t blockOffset = outDim[dim] * blockOffsetInner; + for (size_t i = 0; i < inputs.size(); ++i) { + auto input = inputs[i]; + auto dimOffset = 0; + auto iDim = iDims[i]; + for (size_t j = 0; j < i; ++j) + dimOffset += iDims[j][dim]; + size_t localBlockOffset = 1; + for (size_t i = iDim.size() - 1; + i >= (size_t)dim && i != (size_t)-1; --i) + 
localBlockOffset *= iDim[i]; + auto innerOffset = blockOffsetInner * dimOffset; + auto inSize = input->size(); + auto inPtr = input->getRawDataPtr(), + outPtr = output->getRawDataPtr(); +#pragma omp parallel for + for (size_t iOffset = 0; iOffset < inSize; ++iOffset) { + auto oOffset = iOffset % localBlockOffset + innerOffset + + iOffset / localBlockOffset * blockOffset; + outPtr[oOffset] = inPtr[iOffset]; + } + } + } + + void compute(const Operator &_op, + const RuntimeObj *context) const override { +#define CASE(N) \ + case N: \ + doCompute::t>(_op, context) + + int dataTypeIdx = _op->getDType().getIndex(); + switch (dataTypeIdx) { + CASE(1); // DataType::Float32 + break; + CASE(12); // DataType::UInt32 + break; + default: + IT_TODO_HALT(); + } + } +}; + +REGISTER_KERNEL(Device::CPU, OpType::Concat, NaiveConcat, "ConcatNaive_CPU"); + +} // namespace infini diff --git a/src/kernels/cpu/element_wise.cc b/src/kernels/cpu/element_wise.cc new file mode 100644 index 0000000..af03c7a --- /dev/null +++ b/src/kernels/cpu/element_wise.cc @@ -0,0 +1,119 @@ +#include "operators/element_wise.h" +#include "core/kernel.h" +#include "utils/operator_utils.h" + +namespace infini +{ + class NativeElementWise : public CpuKernelWithoutConfig + { + template + static T addCompute(T val0, T val1) + { + return val0 + val1; + } + + template + static T subCompute(T val0, T val1) + { + return val0 - val1; + } + + template + static T mulCompute(T val0, T val1) + { + return val0 * val1; + } + + template + static T divCompute(T val0, T val1) + { + return (T)(val0 / val1); + } + + template + void doCompute(const Operator &_op, const RuntimeObj *context) const + { + auto op = as(_op); + T *inptr0 = op->getInputs(0)->getRawDataPtr(); + T *inptr1 = op->getInputs(1)->getRawDataPtr(); + T *outptr = op->getOutput()->getRawDataPtr(); + + auto shapeA = op->getInputs(0)->getDims(); + auto shapeB = op->getInputs(1)->getDims(); + auto shapeC = op->getOutput()->getDims(); + auto rank = 
op->getOutput()->getRank(); + Shape a(rank, 1); + Shape b(rank, 1); + std::copy(shapeA.begin(), shapeA.end(), + a.begin() + (rank - shapeA.size())); + std::copy(shapeB.begin(), shapeB.end(), + b.begin() + (rank - shapeB.size())); + auto getStride = [&](const Shape &shape) + { + int p = 1; + Shape stride(rank); + for (auto i = rank; i > 0; --i) + { + stride[i - 1] = p; + p = p * shape[i - 1]; + } + return stride; + }; + Shape strideA = getStride(a); + Shape strideB = getStride(b); + + auto n = op->getOutput()->size(); + T (*_doCompute) + (T val0, T val1); + switch (op->getOpType().underlying()) + { + case OpType::Add: + _doCompute = addCompute; + break; + case OpType::Sub: + _doCompute = subCompute; + break; + case OpType::Mul: + _doCompute = mulCompute; + break; + case OpType::Div: + _doCompute = divCompute; + break; + default: + IT_TODO_HALT(); + } + + for (size_t i = 0; i < n; ++i) + { + auto shapeIndexC = locate_index(i, shapeC); + auto indexA = delocate_index(shapeIndexC, a, strideA); + auto indexB = delocate_index(shapeIndexC, b, strideB); + outptr[i] = _doCompute(inptr0[indexA], inptr1[indexB]); + } + } + + void compute(const Operator &_op, + const RuntimeObj *context) const override + { +#define CASE(N) \ + case N: \ + doCompute::t>(_op, context) + + int dataTypeIdx = _op->getDType().getIndex(); + switch (dataTypeIdx) + { + CASE(1); // DataType::Float32 + break; + CASE(12); // DataType::UInt32 + break; + default: + IT_TODO_HALT(); + } + } + }; + + REGISTER_KERNEL(Device::CPU, OpType::Add, NativeElementWise, "addNaive_CPU"); + REGISTER_KERNEL(Device::CPU, OpType::Sub, NativeElementWise, "subNaive_CPU"); + REGISTER_KERNEL(Device::CPU, OpType::Mul, NativeElementWise, "mulNaive_CPU"); + REGISTER_KERNEL(Device::CPU, OpType::Div, NativeElementWise, "divNaive_CPU"); +}; // namespace infini diff --git a/src/kernels/cpu/transpose.cc b/src/kernels/cpu/transpose.cc new file mode 100644 index 0000000..46292d4 --- /dev/null +++ b/src/kernels/cpu/transpose.cc @@ -0,0 
+1,60 @@ +#include "operators/transpose.h" +#include "core/kernel.h" + +namespace infini { + +inline Shape idx2Pos(const Shape &shape, size_t idx) { + Shape pos = Shape(shape.size(), 0); + auto rest = idx, curDimId = shape.size() - 1; + while (rest > 0) { + pos[curDimId] = rest % shape[curDimId]; + rest /= shape[curDimId]; + curDimId--; + } + return pos; +} + +class NaiveTranspose : public CpuKernelWithoutConfig { + template + void doCompute(const Operator &_op, const RuntimeObj *context) const { + auto op = as(_op); + auto inputs = op->getInputs(), outputs = op->getOutputs(); + const auto &inDim = inputs[0]->getDims(); + const auto &perm = op->getPermute(); + + size_t inSize = inputs[0]->size(); + auto inPtr = inputs[0]->getRawDataPtr(), + outPtr = outputs[0]->getRawDataPtr(); + // #pragma omp parallel for + for (size_t inIdx = 0; inIdx < inSize; ++inIdx) { + auto posInput = idx2Pos(inDim, inIdx); + int outIdx = 0; + for (size_t j = 0, jEnd = perm.size(); j < jEnd; ++j) { + outIdx = outIdx * inDim[perm[j]] + posInput[perm[j]]; + } + outPtr[outIdx] = inPtr[inIdx]; + } + } + + void compute(const Operator &_op, + const RuntimeObj *context) const override { +#define CASE(N) \ + case N: \ + doCompute::t>(_op, context) + + int dataTypeIdx = _op->getDType().getIndex(); + switch (dataTypeIdx) { + CASE(1); // DataType::Float32 + break; + CASE(12); // DataType::UInt32 + break; + default: + IT_TODO_HALT(); + } + } +}; + +REGISTER_KERNEL(Device::CPU, OpType::Transpose, NaiveTranspose, + "TransposeNaive_CPU"); + +} // namespace infini diff --git a/src/kernels/cpu/unary.cc b/src/kernels/cpu/unary.cc new file mode 100644 index 0000000..7e60a4d --- /dev/null +++ b/src/kernels/cpu/unary.cc @@ -0,0 +1,106 @@ +#include "operators/unary.h" +#include "core/constants.h" +#include "core/kernel.h" + +namespace infini +{ + class NativeUnary : public CpuKernelWithoutConfig + { + template + static T reluCompute(T val) + { + return std::max(T(0), val); + } + + template + void doCompute(const 
Operator &_op, const RuntimeObj *context) const + { + auto op = as(_op); + T *inptr = op->getInputs(0)->getRawDataPtr(); + T *outptr = op->getOutput()->getRawDataPtr(); + + auto outDim = op->getOutput()->getDims(); + auto n = op->getOutput()->size(); + + T (*_doCompute) + (T val); + switch (op->getOpType().underlying()) + { + case OpType::Relu: + _doCompute = reluCompute; + break; + default: + IT_TODO_HALT(); + } + + for (size_t offset = 0; offset < n; offset++) + { + outptr[offset] = _doCompute(inptr[offset]); + } + } + + void compute(const Operator &_op, + const RuntimeObj *context) const override + { +#define CASE(N) \ + case N: \ + doCompute::t>(_op, context) + + int dataTypeIdx = _op->getDType().getIndex(); + switch (dataTypeIdx) + { + CASE(1); // DataType::Float32 + break; + CASE(12); // DataType::UInt32 + break; + default: + IT_TODO_HALT(); + } + } + }; + + class Clip : public CpuKernelWithoutConfig + { + template + void doCompute(const Operator &_op, const RuntimeObj *context) const + { + auto op = as(_op); + T *inptr = op->getInputs(0)->getRawDataPtr(); + T *outptr = op->getOutput()->getRawDataPtr(); + auto minValue = op->getMin(); + auto maxValue = op->getMax(); + + auto n = op->getOutput()->size(); + for (size_t offset = 0; offset < n; offset++) + { + auto val = *inptr++; + *outptr++ = (minValue && val < *minValue) ? *minValue + : (maxValue && val > *maxValue) ? 
*maxValue + : val; + } + } + + void compute(const Operator &_op, + const RuntimeObj *context) const override + { +#define CASE(N) \ + case N: \ + doCompute::t>(_op, context) + + int dataTypeIdx = _op->getDType().getIndex(); + switch (dataTypeIdx) + { + CASE(1); // DataType::Float32 + break; + CASE(12); // DataType::UInt32 + break; + default: + IT_TODO_HALT(); + } + } + }; + + REGISTER_KERNEL(Device::CPU, OpType::Relu, NativeUnary, "reluNaive_CPU"); + REGISTER_KERNEL(Device::CPU, OpType::Clip, Clip, "Clip_CPU"); + +}; // namespace infini diff --git a/src/operators/concat.cc b/src/operators/concat.cc new file mode 100644 index 0000000..d196330 --- /dev/null +++ b/src/operators/concat.cc @@ -0,0 +1,38 @@ +#include "operators/concat.h" +#include "utils/operator_utils.h" + +namespace infini { +ConcatObj::ConcatObj(GraphObj *graph, TensorVec inputs, Tensor output, int _dim) + : OperatorObj(OpType::Concat, inputs, {output}) { + int rank = inputs[0]->getRank(); + dim = get_real_axis(_dim, rank); + IT_ASSERT(checkValid(graph)); +} + +optional> ConcatObj::inferShape(const TensorVec &inputs) { + Shape dims = inputs[0]->getDims(); + auto rank = inputs[0]->getRank(); + + // =================================== 作业 =================================== + // TODO:修改 dims,返回正确的 concat 后的 shape + // REF: https://onnx.ai/onnx/operators/onnx__Concat.html#concat-13 + // =================================== 作业 =================================== + + return {{dims}}; +} + +std::string ConcatObj::toString() const { + std::ostringstream os; + os << "Concat[" << getGuid() << "]"; + os << "("; + for (auto input : inputs) + os << vecToString(input->getDims()) << ","; + os << "dim=" << dim << ","; + os << "input="; + for (auto input : inputs) + os << input->getGuid() << ","; + os << "output=" << outputs[0]->getGuid() << ")"; + return os.str(); +} + +} // namespace infini diff --git a/src/operators/element_wise.cc b/src/operators/element_wise.cc new file mode 100644 index 0000000..c1b4ef1 --- 
/dev/null +++ b/src/operators/element_wise.cc @@ -0,0 +1,33 @@ +#include "operators/element_wise.h" +#include "utils/operator_utils.h" + +namespace infini +{ + ElementWiseObj::ElementWiseObj(OpType type, GraphObj *graph, Tensor input0, + Tensor input1, Tensor output) + : OperatorObj(type, {input0, input1}, {output}) + { + IT_ASSERT(checkValid(graph)); + } + + optional> ElementWiseObj::inferShape(const TensorVec &inputs) + { + const auto A = inputs[0], B = inputs[1]; + auto res = infer_broadcast(A->getDims(), B->getDims()); + return {{res}}; + } + + std::string ElementWiseObj::toString() const + { + std::ostringstream os; + os << type.toString() << "[" << getGuid() << "]"; + os << "("; + os << vecToString(inputs[0]->getDims()) << ","; + os << vecToString(inputs[1]->getDims()) << ","; + os << "input0=" << inputs[0]->getGuid() << ","; + os << "input1=" << inputs[1]->getGuid() << ","; + os << "output=" << outputs[0]->getGuid() << ")"; + return os.str(); + } + +}; // namespace infini diff --git a/src/operators/matmul.cc b/src/operators/matmul.cc new file mode 100644 index 0000000..0ce94a1 --- /dev/null +++ b/src/operators/matmul.cc @@ -0,0 +1,33 @@ +#include "operators/matmul.h" + +namespace infini +{ + + MatmulObj::MatmulObj(GraphObj *graph, Tensor A, Tensor B, Tensor C, bool transA, + bool transB) + : OperatorObj(OpType::MatMul, TensorVec{A, B}, {C}), + transA(transA), transB(transB) + { + IT_ASSERT(checkValid(graph)); + } + + string MatmulObj::toString() const + { + std::ostringstream os; + os << "Matmul([" << (transA ? "A^T" : "A") << "," << (transB ? 
"B^T" : "B]") + << ",A=" << inputs[0]->getGuid() + << ",B=" << inputs[1]->getGuid() << ",C=" << outputs[0]->getGuid() + << ",mnk=[" << m << "," << n << "," << k << "])"; + return os.str(); + } + + optional> MatmulObj::inferShape(const TensorVec &inputs) + { + // =================================== 作业 =================================== + // TODO:返回经过 matmul 操作后的 shape + // REF: https://github.com/onnx/onnx/blob/main/docs/Operators.md#gemm + // =================================== 作业 =================================== + return {{}}; + } + +} // namespace infini \ No newline at end of file diff --git a/src/operators/transpose.cc b/src/operators/transpose.cc new file mode 100644 index 0000000..f5e9659 --- /dev/null +++ b/src/operators/transpose.cc @@ -0,0 +1,50 @@ +#include "operators/transpose.h" + +namespace infini +{ + TransposeObj::TransposeObj(GraphObj *graph, Tensor input, Tensor output, + vector permute) + : OperatorObj(OpType::Transpose, {input}, {output}) + { + auto rank = input->getRank(); + if (permute.empty()) + { + for (size_t i = 0; i < rank; ++i) + { + transposePermute[i] = i; + } + } + else + { + IT_ASSERT(rank == permute.size()); + transposePermute = std::move(permute); + } + IT_ASSERT(checkValid(graph)); + } + + optional> TransposeObj::inferShape(const TensorVec &inputs) + { + const auto A = inputs[0]; + auto input_dim = A->getDims(); + auto output_dim = input_dim; + int rank = A->getRank(); + + // =================================== 作业 =================================== + // TODO:修改 output_dim,返回正确的 transpose 后的 shape + // REF: https://onnx.ai/onnx/operators/onnx__Transpose.html#transpose-21 + // =================================== 作业 =================================== + + return {{}}; + } + + std::string TransposeObj::toString() const + { + std::ostringstream os; + os << type.toString() << "[" << getGuid() << "]"; + os << "("; + os << vecToString(inputs[0]->getDims()) << ","; + os << "input=" << inputs[0]->getGuid() << ","; + os << "output=" << 
outputs[0]->getGuid() << ")"; + return os.str(); + } +}; // namespace infini diff --git a/src/operators/unary.cc b/src/operators/unary.cc new file mode 100644 index 0000000..e773506 --- /dev/null +++ b/src/operators/unary.cc @@ -0,0 +1,148 @@ +#include "operators/unary.h" + +namespace infini +{ + UnaryObj::UnaryObj(OpType type, GraphObj *graph, Tensor input, Tensor output) + : OperatorObj(type, {input}, {output}) + { + IT_ASSERT(checkValid(graph)); + } + + optional> UnaryObj::inferShape(const TensorVec &inputs) + { + const auto A = inputs[0]; + return {{A->getDims()}}; + } + + std::string UnaryObj::toString() const + { + std::ostringstream os; + os << type.toString() << "[" << getGuid() << "]"; + os << "("; + os << vecToString(inputs[0]->getDims()) << ","; + os << "input=" << inputs[0]->getGuid() << ","; + os << "output=" << outputs[0]->getGuid() << ")"; + return os.str(); + } + + ClipObj::ClipObj(GraphObj *graph, Tensor input, Tensor output, + std::optional min, std::optional max) + : OperatorObj(OpType::Clip, {input}, {output}), minValue(min), + maxValue(max) + { + IT_ASSERT(checkValid(graph)); + } + + optional> ClipObj::inferShape(const TensorVec &inputs) + { + // =================================== 作业 =================================== + // TODO:返回经过 clip 操作后的 shape + // REF: https://onnx.ai/onnx/operators/onnx__Clip.html#clip-13 + // =================================== 作业 =================================== + return {{}}; + } + + std::string ClipObj::toString() const + { + std::ostringstream os; + os << type.toString() << "[" << getGuid() << "]"; + os << "("; + os << vecToString(inputs[0]->getDims()) << ","; + os << "input=" << inputs[0]->getGuid() << ","; + os << "output=" << outputs[0]->getGuid() << ")"; + return os.str(); + } + + CastObj::CastObj(GraphObj *graph, Tensor input, Tensor output, CastType type) + : OperatorObj(OpType::Cast, {input}, {output}), castType(type) + { + IT_ASSERT(checkValid(graph)); + } + + vector CastObj::inferDataType(const 
TensorVec &inputs) const + { + // =================================== 作业 =================================== + // TODO:返回经过 cast 操作后, 输出 tensor 的数目和数据类型 + // REF_FILE: src/core/operator.cc + // REF: https://onnx.ai/onnx/operators/onnx__Cast.html#cast-21 + // =================================== 作业 =================================== + return {}; + } + + optional> CastObj::inferShape(const TensorVec &inputs) + { + // =================================== 作业 =================================== + // TODO:返回经过 cast 操作后的 shape + // REF: https://onnx.ai/onnx/operators/onnx__Cast.html#cast-21 + // =================================== 作业 =================================== + return {{}}; + } + + std::string CastObj::toString() const + { + std::ostringstream os; + os << type.toString() << "[" << getGuid() << "]"; + os << "("; + os << "output=" << outputs[0]->getGuid() << ")"; + return os.str(); + } + + DataType CastObj::getOutputDataType() const + { + switch (castType) + { + case CastType::Float2Float16: + return DataType::Float16; + case CastType::Float2Int64: + return DataType::Int64; + case CastType::Float2Int32: + return DataType::Int32; + case CastType::Float2Int16: + return DataType::Int16; + case CastType::Float2Int8: + return DataType::Int8; + case CastType::Int322Float: + return DataType::Float32; + case CastType::Int322Int8: + return DataType::Int8; + case CastType::Int322Int16: + return DataType::Int16; + case CastType::Int162Float: + return DataType::Float32; + case CastType::Int162Int32: + return DataType::Int32; + case CastType::Int82Float: + return DataType::Float32; + case CastType::Int82Int16: + return DataType::Int16; + case CastType::Int82Int32: + return DataType::Int32; + case CastType::Uint82Float: + return DataType::Float32; + case CastType::Uint82Int32: + return DataType::Int32; + case CastType::Uint82Int64: + return DataType::Int64; + case CastType::Int322Int64: + return DataType::Int64; + case CastType::Int642Int32: + return DataType::Int32; + case 
CastType::Int642Uint32: + return DataType::UInt32; + case CastType::Int642Float: + return DataType::Float32; + case CastType::Uint322Int64: + return DataType::Int64; + case CastType::Float162Float: + return DataType::Float32; + case CastType::BFloat162Float: + return DataType::Float32; + case CastType::Float2BFloat16: + return DataType::BFloat16; + case CastType::Float2Float: + return DataType::Float32; + default: + IT_TODO_HALT(); + } + } +}; // namespace infini diff --git a/src/utils/exception.cc b/src/utils/exception.cc new file mode 100644 index 0000000..228a39a --- /dev/null +++ b/src/utils/exception.cc @@ -0,0 +1,5 @@ +#include "utils/exception.h" + +namespace infini { +Exception::Exception(const std::string &msg) : std::runtime_error(msg) {} +} // namespace infini diff --git a/src/utils/operator_utils.cc b/src/utils/operator_utils.cc new file mode 100644 index 0000000..edbd2c8 --- /dev/null +++ b/src/utils/operator_utils.cc @@ -0,0 +1,69 @@ +#include "utils/operator_utils.h" +#include "core/runtime.h" + +namespace infini { + +Shape infer_broadcast(const Shape &A, const Shape &B) { + + // =================================== 作业 =================================== + // TODO:对 A 和 B 进行双向广播,返回广播后的形状。 + // REF: https://github.com/onnx/onnx/blob/main/docs/Broadcasting.md + // =================================== 作业 =================================== + + return {}; +} + +int get_real_axis(const int &axis, const int &rank) { + IT_ASSERT(rank >= 1); + IT_ASSERT(axis >= -rank && axis <= (rank - 1)); + int newAxis; + if (axis < 0) { + newAxis = rank + axis; + } else { + newAxis = axis; + } + return newAxis; +} + +Shape locate_index(size_t inputN, const Shape &shape) { + Shape ans(shape.size()); + auto i = ans.rbegin(); + auto j = shape.rbegin(), ej = shape.rend(); + while (j != ej) { + auto div = std::div(inputN, *j++); + *i++ = div.rem; + inputN = div.quot; + } + return ans; +} + +size_t delocate_index(const Shape &shapeIndex, const Shape &shape, + const Shape &stride) 
{ + size_t ans = 0; + Shape index(shapeIndex.size()); + IT_ASSERT(shapeIndex.size() == shape.size()); + IT_ASSERT(shape.size() == stride.size()); + for (size_t i = 0; i < shape.size(); ++i) { + index[i] = shapeIndex[i] % shape[i]; + ans += index[i] * stride[i]; + } + return ans; +} + +std::string device_to_str(Device device) { + std::string deviceStr; + switch (device) { + case Device::CPU: + return "CPU"; + default: + IT_TODO_HALT(); + } +} + +std::string get_kernel_attrs_str(const KernelAttrs &kernelAttrs) { + std::string deviceStr = device_to_str(std::get<0>(kernelAttrs)); + std::string opStr = OpType(std::get<1>(kernelAttrs)).toString(); + return deviceStr + ", " + opStr; +} + +} // namespace infini diff --git a/test/core/test_allocator.cc b/test/core/test_allocator.cc new file mode 100644 index 0000000..0515edc --- /dev/null +++ b/test/core/test_allocator.cc @@ -0,0 +1,74 @@ +#include "core/graph.h" +#include "core/kernel.h" +#include "core/runtime.h" +#include "operators/unary.h" + +#include "test.h" + +namespace infini +{ + TEST(Allocator, testAlloc) + { + Shape shape = Shape{1, 2, 2, 3}; + Runtime runtime = NativeCpuRuntimeObj::getInstance(); + Tensor a = make_ref(shape, DataType::Float32, runtime); + Tensor b = make_ref(shape, DataType::Float32, runtime); + Tensor c = make_ref(shape, DataType::Float32, runtime); + Tensor d = make_ref(shape, DataType::Float32, runtime); + Allocator allocator = Allocator(runtime); + // allocate a->b->c + size_t offsetA = allocator.alloc(a->getBytes()); + size_t offsetB = allocator.alloc(b->getBytes()); + size_t offsetC = allocator.alloc(c->getBytes()); + // free b, then allocate d + allocator.free(offsetB, b->getBytes()); + size_t offsetD = allocator.alloc(d->getBytes()); + // expected to be a->d->c + EXPECT_EQ(offsetB, offsetD); + ASSERT_FALSE(offsetA == 0 && offsetB == 0 && offsetC == 0 && offsetD == 0); + } + + TEST(Allocator, testAllocWithEndFreeBlock) + { + Shape shape = Shape{1, 2, 2, 3}; + Runtime runtime = 
NativeCpuRuntimeObj::getInstance(); + Tensor a = make_ref(shape, DataType::Float32, runtime); + Tensor b = make_ref(shape, DataType::Float32, runtime); + Tensor c = make_ref(shape, DataType::Float32, runtime); + Tensor d = + make_ref(Shape{2, 2, 2, 3}, DataType::Float32, runtime); + Allocator allocator = Allocator(runtime); + // allocate a->b->c + allocator.alloc(a->getBytes()); + allocator.alloc(b->getBytes()); + size_t offsetC = allocator.alloc(c->getBytes()); + allocator.info(); + // free c, then allocate d + allocator.free(offsetC, c->getBytes()); + size_t offsetD = allocator.alloc(d->getBytes()); + allocator.info(); + // expected to be a->b->d, with no free block between b and c + EXPECT_EQ(offsetC, offsetD); + } + + TEST(Allocator, testGetPtr) + { + Shape shape = Shape{1, 2, 2, 3}; + Runtime runtime = NativeCpuRuntimeObj::getInstance(); + Tensor a = make_ref(shape, DataType::Float32, runtime); + Tensor b = make_ref(shape, DataType::Float32, runtime); + Tensor c = make_ref(shape, DataType::Float32, runtime); + Tensor d = make_ref(shape, DataType::Float32, runtime); + Allocator allocator = Allocator(runtime); + // allocate a->b->c->d + allocator.alloc(a->getBytes()); + allocator.alloc(b->getBytes()); + allocator.alloc(c->getBytes()); + allocator.alloc(d->getBytes()); + // multiple calls to the getPtr() function should return the same pointer + void *ptr1 = allocator.getPtr(); + void *ptr2 = allocator.getPtr(); + EXPECT_EQ(ptr1, ptr2); + } + +} // namespace infini diff --git a/test/core/test_graph.cc b/test/core/test_graph.cc new file mode 100644 index 0000000..89c22dd --- /dev/null +++ b/test/core/test_graph.cc @@ -0,0 +1,40 @@ +#include "core/graph.h" +#include "core/kernel.h" +#include "core/runtime.h" +#include "operators/matmul.h" +#include "operators/transpose.h" + +#include "test.h" + +namespace infini +{ + TEST(Graph, Optimize) + { + Runtime runtime = NativeCpuRuntimeObj::getInstance(); + Graph g = make_ref(runtime); + Tensor i1 = g->addTensor({2, 3, 4, 
5}, DataType::UInt32); + Tensor i2 = g->addTensor({2, 3, 4, 5}, DataType::UInt32); + Tensor t1 = g->addTensor({2, 3, 5, 4}, DataType::UInt32); + Tensor t2 = g->addTensor({2, 3, 4, 5}, DataType::UInt32); + Tensor t3 = g->addTensor({2, 3, 5, 4}, DataType::UInt32); + Tensor o = g->addTensor({2, 3, 4, 4}, DataType::UInt32); + g->addOpWithOutputs(i1, t1, Shape{0, 1, 3, 2}); + g->addOpWithOutputs(t1, t2, Shape{0, 1, 3, 2}); + g->addOpWithOutputs(i2, t3, Shape{0, 1, 3, 2}); + g->addOpWithOutputs(t2, t3, o); + // 优化前 + g->print(); + g->optimize(); + // 优化后 + g->print(); + EXPECT_EQ(g->getOperators().size(), 1); + EXPECT_EQ(g->getTensors().size(), 3); + EXPECT_EQ(g->getOperators()[0]->getOpType().underlying(), 8); + auto op = as(g->getOperators()[0]); + EXPECT_EQ(op->getInputs(0)->getGuid(), 2); + EXPECT_EQ(op->getInputs(1)->getGuid(), 3); + EXPECT_EQ(op->getOutputs()[0], o); + EXPECT_EQ(op->getTransA(), false); + EXPECT_EQ(op->getTransB(), true); + } +} \ No newline at end of file diff --git a/test/kernels/nativecpu/test_nativecpu_concat.cc b/test/kernels/nativecpu/test_nativecpu_concat.cc new file mode 100644 index 0000000..fc87fb1 --- /dev/null +++ b/test/kernels/nativecpu/test_nativecpu_concat.cc @@ -0,0 +1,28 @@ +#include "core/graph.h" +#include "core/runtime.h" +#include "operators/concat.h" + +#include "test.h" + +namespace infini { + +TEST(Concat, NativeCpu) { + Runtime runtime = NativeCpuRuntimeObj::getInstance(); + Graph g = make_ref(runtime); + + auto t1 = g->addTensor({2, 2, 3, 1}, DataType::Float32); + auto t2 = g->addTensor({2, 2, 1, 1}, DataType::Float32); + auto t3 = g->addTensor({2, 2, 2, 1}, DataType::Float32); + auto op = g->addOp(TensorVec{t1, t2, t3}, nullptr, 2); + g->dataMalloc(); + t1->setData(IncrementalGenerator()); + t2->setData(OneGenerator()); + t3->setData(OneGenerator()); + + runtime->run(g); + EXPECT_TRUE(op->getOutput()->equalData( + vector{0, 1, 2, 1, 1, 1, 3, 4, 5, 1, 1, 1, + 6, 7, 8, 1, 1, 1, 9, 10, 11, 1, 1, 1})); +} + +} // namespace 
infini diff --git a/test/kernels/nativecpu/test_nativecpu_elementwise.cc b/test/kernels/nativecpu/test_nativecpu_elementwise.cc new file mode 100644 index 0000000..c6ef191 --- /dev/null +++ b/test/kernels/nativecpu/test_nativecpu_elementwise.cc @@ -0,0 +1,44 @@ +#include "core/graph.h" +#include "core/runtime.h" +#include "operators/element_wise.h" + +#include "test.h" + +namespace infini { + +using ExpectOutput = vector; +template +void testElementWiseNativeCpu( + const std::function &generator1, + const std::function &generator2, + const Shape &shape1, const Shape &shape2, const ExpectOutput &ansVec) { + Runtime runtime = NativeCpuRuntimeObj::getInstance(); + Graph g = make_ref(runtime); + auto t1 = g->addTensor(shape1, DataType::Float32); + auto t2 = g->addTensor(shape2, DataType::Float32); + + auto op = g->addOp(t1, t2, nullptr); + g->dataMalloc(); + t1->setData(generator1); + t2->setData(generator2); + + runtime->run(g); + EXPECT_TRUE(op->getOutput()->equalData(ansVec)); +} + +TEST(ElementWise, NativeCpu) { + testElementWiseNativeCpu( + IncrementalGenerator(), IncrementalGenerator(), Shape{1, 2, 2, 3, 1}, + Shape{2, 1, 1}, ExpectOutput{0, 1, 2, 4, 5, 6, 6, 7, 8, 10, 11, 12}); + testElementWiseNativeCpu( + IncrementalGenerator(), IncrementalGenerator(), Shape{1, 2, 2, 3, 1}, + Shape{2, 1, 1}, ExpectOutput{0, 0, 0, 3, 4, 5, 0, 0, 0, 9, 10, 11}); + testElementWiseNativeCpu( + IncrementalGenerator(), IncrementalGenerator(), Shape{1, 2, 2, 3, 1}, + Shape{2, 1, 1}, ExpectOutput{0, 1, 2, 2, 3, 4, 6, 7, 8, 8, 9, 10}); + testElementWiseNativeCpu( + IncrementalGenerator(), OneGenerator(), Shape{1, 2, 2, 3, 1}, + Shape{2, 1, 1}, ExpectOutput{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}); +} + +} // namespace infini diff --git a/test/kernels/nativecpu/test_nativecpu_transpose.cc b/test/kernels/nativecpu/test_nativecpu_transpose.cc new file mode 100644 index 0000000..501d402 --- /dev/null +++ b/test/kernels/nativecpu/test_nativecpu_transpose.cc @@ -0,0 +1,27 @@ +#include 
"core/graph.h" +#include "core/kernel.h" +#include "core/runtime.h" +#include "operators/transpose.h" + +#include "test.h" + +namespace infini { + +TEST(Transpose, NativeCpu) { + Runtime runtime = NativeCpuRuntimeObj::getInstance(); + Graph g = make_ref(runtime); + + Shape permute = {0, 2, 1, 3}; + auto input = g->addTensor({1, 2, 3, 4}, DataType::Float32); + auto op = g->addOp(input, nullptr, permute); + g->dataMalloc(); + input->setData(IncrementalGenerator()); + + runtime->run(g); + + EXPECT_TRUE(op->getOutput(0)->equalData(vector{0, 1, 2, 3, 12, 13, 14, 15, + 4, 5, 6, 7, 16, 17, 18, 19, + 8, 9, 10, 11, 20, 21, 22, 23})); +} + +} // namespace infini diff --git a/test/operators/test_cast.cc b/test/operators/test_cast.cc new file mode 100644 index 0000000..3177751 --- /dev/null +++ b/test/operators/test_cast.cc @@ -0,0 +1,23 @@ +#include "core/graph.h" +#include "core/kernel.h" +#include "core/runtime.h" +#include "operators/unary.h" + +#include "test.h" + +namespace infini +{ + + TEST(Cast, ShapeInference) + { + Runtime runtime = NativeCpuRuntimeObj::getInstance(); + { + Graph g = make_ref(runtime); + Tensor i0 = g->addTensor({2}, DataType::Float32); + auto op = g->addOp(i0, nullptr, CastType::Float2Float16); + EXPECT_EQ(op->getOutput()->getDims(), (Shape{2})); + EXPECT_EQ(op->getOutDType(), (DataType::Float16)); + } + } + +} // namespace infini diff --git a/test/operators/test_clip.cc b/test/operators/test_clip.cc new file mode 100644 index 0000000..bd4e07f --- /dev/null +++ b/test/operators/test_clip.cc @@ -0,0 +1,23 @@ +#include "core/graph.h" +#include "core/kernel.h" +#include "core/runtime.h" +#include "operators/unary.h" + +#include "test.h" + +namespace infini { + + TEST(Clip, ShapeInference) + { + // Runtime + Runtime runtime = NativeCpuRuntimeObj::getInstance(); + Graph g = make_ref(runtime); + Tensor i0 = g->addTensor({1, 2, 2, 3}, DataType::Float32); + float min = 1.0; + float max = 4.0; + auto op = g->addOp(i0, nullptr, min, max); + 
EXPECT_EQ(op->getOutput()->getDims(), (Shape{1, 2, 2, 3})); + EXPECT_EQ(op->getOutDType(), (DataType::Float32)); + } + +} // namespace infini diff --git a/test/operators/test_concat.cc b/test/operators/test_concat.cc new file mode 100644 index 0000000..8984b9f --- /dev/null +++ b/test/operators/test_concat.cc @@ -0,0 +1,16 @@ +#include "core/graph.h" +#include "core/runtime.h" +#include "operators/concat.h" +#include "test.h" + +namespace infini { +TEST(Concat, ShapeInfer) { + Runtime runtime = NativeCpuRuntimeObj::getInstance(); + Graph g = make_ref(runtime); + auto t1 = g->addTensor({1, 3, 2, 4}, DataType::Float32); + auto t2 = g->addTensor({1, 3, 2, 5}, DataType::Float32); + + auto op = g->addOp(TensorVec{t1, t2}, nullptr, 3); + EXPECT_EQ(op->getOutput()->getDims(), (Shape{1, 3, 2, 9})); +} +} // namespace infini diff --git a/test/operators/test_element_wise.cc b/test/operators/test_element_wise.cc new file mode 100644 index 0000000..f4fdd66 --- /dev/null +++ b/test/operators/test_element_wise.cc @@ -0,0 +1,66 @@ +#include "core/graph.h" +#include "core/kernel.h" +#include "core/runtime.h" +#include "operators/element_wise.h" + +#include "test.h" + +namespace infini { + + TEST(ElementWise, ShapeInference) + { + Runtime runtime = NativeCpuRuntimeObj::getInstance(); + { + Graph g = make_ref(runtime); + Tensor i0 = g->addTensor({2, 3, 3, 4}, DataType::UInt32); + Tensor i1 = g->addTensor({2, 3, 3, 4}, DataType::UInt32); + auto op = g->addOp(i0, i1, nullptr); + EXPECT_EQ(op->getOutput()->getDims(), (Shape{2, 3, 3, 4})); + } + } + + TEST(ElementWise, Broadcasting) + { + Runtime runtime = NativeCpuRuntimeObj::getInstance(); + { + Graph g = make_ref(runtime); + Tensor i0 = g->addTensor({2, 3, 4, 5}, DataType::UInt32); + Tensor i1 = g->addTensor({}, DataType::UInt32); + auto op = g->addOp(i0, i1, nullptr); + EXPECT_EQ(op->getOutput()->getDims(), (Shape{2, 3, 4, 5})); + } + + { + Graph g = make_ref(runtime); + Tensor i0 = g->addTensor({2, 3, 4, 5}, DataType::UInt32); + 
Tensor i1 = g->addTensor({5}, DataType::UInt32); + auto op = g->addOp(i0, i1, nullptr); + EXPECT_EQ(op->getOutput()->getDims(), (Shape{2, 3, 4, 5})); + } + + { + Graph g = make_ref(runtime); + Tensor i0 = g->addTensor({4, 5}, DataType::UInt32); + Tensor i1 = g->addTensor({2, 3, 4, 5}, DataType::UInt32); + auto op = g->addOp(i0, i1, nullptr); + EXPECT_EQ(op->getOutput()->getDims(), (Shape{2, 3, 4, 5})); + } + + { + Graph g = make_ref(runtime); + Tensor i0 = g->addTensor({1, 4, 5}, DataType::UInt32); + Tensor i1 = g->addTensor({2, 3, 1, 1}, DataType::UInt32); + auto op = g->addOp(i0, i1, nullptr); + EXPECT_EQ(op->getOutput()->getDims(), (Shape{2, 3, 4, 5})); + } + + { + Graph g = make_ref(runtime); + Tensor i0 = g->addTensor({3, 4, 5}, DataType::UInt32); + Tensor i1 = g->addTensor({2, 1, 1, 1}, DataType::UInt32); + auto op = g->addOp(i0, i1, nullptr); + EXPECT_EQ(op->getOutput()->getDims(), (Shape{2, 3, 4, 5})); + } + } + +} // namespace infini diff --git a/test/operators/test_matmul.cc b/test/operators/test_matmul.cc new file mode 100644 index 0000000..32fbc36 --- /dev/null +++ b/test/operators/test_matmul.cc @@ -0,0 +1,57 @@ +#include "core/graph.h" +#include "core/kernel.h" +#include "core/runtime.h" +#include "operators/matmul.h" + +#include "test.h" + +namespace infini +{ + using ExpectOutput = vector; + + TEST(Matmul, ShapeInference) + { + auto runtime = NativeCpuRuntimeObj::getInstance(); + { + Graph g = make_ref(runtime); + auto A = g->addTensor(Shape{1, 3, 5}); + auto B = g->addTensor(Shape{1, 5, 2}); + auto matmul = g->addOp(A, B, nullptr); + auto C = matmul->getOutputs()[0]; + EXPECT_EQ(C->getDims(), (Shape{1, 3, 2})); + } + { + Graph g = make_ref(runtime); + auto A = g->addTensor(Shape{3, 5, 4}); + auto B = g->addTensor(Shape{3, 5, 2}); + auto matmul = g->addOp(A, B, nullptr, true, false); + auto C = matmul->getOutputs()[0]; + EXPECT_EQ(C->getDims(), (Shape{3, 4, 2})); + } + { + Graph g = make_ref(runtime); + auto A = g->addTensor(Shape{1, 2, 3, 5}); + 
auto B = g->addTensor(Shape{1, 1, 5, 2}); + auto matmul = g->addOp(A, B, nullptr); + auto C = matmul->getOutputs()[0]; + EXPECT_EQ(C->getDims(), (Shape{1, 2, 3, 2})); + } + { + Graph g = make_ref(runtime); + auto A = g->addTensor(Shape{2, 3, 5, 4}); + auto B = g->addTensor(Shape{1, 3, 5, 2}); + auto matmul = g->addOp(A, B, nullptr, true, false); + auto C = matmul->getOutputs()[0]; + EXPECT_EQ(C->getDims(), (Shape{2, 3, 4, 2})); + } + { + Graph g = make_ref(runtime); + auto A = g->addTensor(Shape{2, 3, 5, 4}); + auto B = g->addTensor(Shape{1, 3, 2, 5}); + auto matmul = g->addOp(A, B, nullptr, true, true); + auto C = matmul->getOutputs()[0]; + EXPECT_EQ(C->getDims(), (Shape{2, 3, 4, 2})); + } + } + +}; // namespace infini \ No newline at end of file diff --git a/test/operators/test_transpose.cc b/test/operators/test_transpose.cc new file mode 100644 index 0000000..1c12b79 --- /dev/null +++ b/test/operators/test_transpose.cc @@ -0,0 +1,32 @@ +#include "core/graph.h" +#include "core/kernel.h" +#include "core/runtime.h" +#include "operators/transpose.h" + +#include "test.h" + +namespace infini { + +TEST(Transpose, ShapeInference) { + Runtime runtime = NativeCpuRuntimeObj::getInstance(); + { + Graph g = make_ref(runtime); + Tensor i = g->addTensor({1, 2, 3, 4}, DataType::Float32); + auto op = g->addOp(i, nullptr, Shape{0, 1, 2, 3}); + EXPECT_EQ(op->getOutput()->getDims(), (Shape{1, 2, 3, 4})); + } + { + Graph g = make_ref(runtime); + Tensor i = g->addTensor({1, 2, 3, 4}, DataType::Float32); + auto op = g->addOp(i, nullptr, Shape{0, 2, 1, 3}); + EXPECT_EQ(op->getOutput()->getDims(), (Shape{1, 3, 2, 4})); + } + { + Graph g = make_ref(runtime); + Tensor i = g->addTensor({2, 3, 4}, DataType::Float32); + auto op = g->addOp(i, nullptr, Shape{0, 2, 1}); + EXPECT_EQ(op->getOutput()->getDims(), (Shape{2, 4, 3})); + } +} + +} // namespace infini