Update readmes #9

Merged · 10 commits · Sep 6, 2023
17 changes: 0 additions & 17 deletions README.md
@@ -4,8 +4,6 @@

![icpp-llm logo](./assets/icpp-llm-logo.png)

Video: [How to run llama2.c on the Internet Computer](https://www.loom.com/share/a065b678df63462fb2f637d1b550b5d2?sid=1aeee693-25c0-4d1f-be0c-8231b53eb742)

# The Benefits of Running LLMs On-Chain

The canisters within the Internet Computer have certain constraints. They come with memory restrictions, and there's a cap on the number of instructions one can execute per message, as discussed [here](https://forum.dfinity.org/t/instruction-limit-is-crushing-me/22070/10?u=icpp).
@@ -23,21 +21,6 @@ Coherent English?](https://arxiv.org/pdf/2305.07759.pdf)

Besides the ease of use and the enhanced security, running LLMs directly on-chain also facilitates a seamless integration of tokenomics, eliminating the need to juggle between a complex blend of web3 and web2 components, and I believe it will lead to a new category of Generative AI based dApps.

# Try it

`icpp_llama2` with the stories15M.bin is running on-chain in canister `4c4bn-daaaa-aaaag-abvcq-cai`.

You can call its inference endpoint with:

```bash
dfx canister call --network ic 4c4bn-daaaa-aaaag-abvcq-cai inference '(record {prompt = "" : text; steps = 20 : nat64; temperature = 0.8 : float32; topp = 1.0 : float32;})'
(
variant {
ok = "Once upon a time, there was a little boat named Bob. Bob loved to float on the water"
},
)
```
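
In this record, `steps` is the number of tokens to generate, while `temperature` and `topp` control the randomness of the sampler (they correspond to the llama2.c sampling parameters of the same names); a successful call returns the generated text in the `ok` variant.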


## Instructions

Binary file added assets/icpp_llama2_without_limits.png
37 changes: 4 additions & 33 deletions icpp_llama2/README.md
@@ -1,22 +1,5 @@
# [karpathy/llama2.c](https://github.com/karpathy/llama2.c) for the Internet Computer

Video: [How to run llama2.c on the Internet Computer](https://www.loom.com/share/a065b678df63462fb2f637d1b550b5d2?sid=1aeee693-25c0-4d1f-be0c-8231b53eb742)

# Try it

`icpp_llama2` with the stories15M.bin is running on-chain in canister `4c4bn-daaaa-aaaag-abvcq-cai`.

You can call its inference endpoint with:

```bash
dfx canister call --network ic 4c4bn-daaaa-aaaag-abvcq-cai inference '(record {prompt = "" : text; steps = 20 : nat64; temperature = 0.8 : float32; topp = 0.9 : float32; rng_seed = 0 : nat64;})'
(
variant {
ok = "Once upon a time, there was a little boat named Bob. Bob loved to float on the water"
},
)
```

# Instructions

- Install the C++ development environment for the Internet Computer ([docs](https://docs.icpp.world/installation.html)):
@@ -53,20 +36,14 @@ dfx canister call --network ic 4c4bn-daaaa-aaaag-abvcq-cai inference '(record {p
Invoke-WebRequest -Uri https://huggingface.co/karpathy/tinyllamas/resolve/main/stories15M.bin -OutFile .\models\stories15M.bin
```

- The *demo* script starts the local network, deploys llama2, and uploads the model & tokenizer:
- The *demo* script starts the local network, deploys llama2, uploads the model & tokenizer, and generates two stories:
- `./demo.sh` , on Linux / Mac
- `.\demo.ps1` , in Windows PowerShell (Miniconda recommended)

This screenshot shows the generation of the second story:

![icpp_llama2_without_limits](../assets/icpp_llama2_without_limits.png)

- Call the llama2 canister's *inference* endpoint:
```bash
dfx canister call llama2 inference '(record {prompt = "" : text; steps = 20 : nat64; temperature = 0.9 : float32; topp = 0.9 : float32; rng_seed = 0 : nat64;})'
(
variant {
ok = "Once upon a time, there was a little boat named Bob. Bob loved to float on the water"
},
)
```

# stories260K

@@ -96,12 +73,6 @@ For testing, it is nice to be able to work with a smaller model & tokenizer:
)
```


# Limitations

- When using stories15M.bin with the default tokenizer, you can generate a story of ~20 steps per call. When asking for longer stories, the canister throws an error due to the current limit on the number of instructions per message.
- The weights are stored in static/global memory, so they are Orthogonally Persisted. Canisters currently have a 4 GB memory limit.
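
A possible workaround for longer stories, and the flow the updated demo scripts use, is to generate the story in small increments across multiple calls, so that each message stays under the instruction limit. A minimal sketch of this chunked flow, assuming the canister is deployed locally under the name `llama2` as in demo.sh:

```bash
# Start a fresh story, then extend it 10 tokens per call; each call is a
# separate message, so each stays within the per-message instruction limit.
dfx canister call llama2 new_chat '()'
for i in 1 2 3; do
  dfx canister call llama2 inference '(record {prompt = "" : text; steps = 10 : nat64; temperature = 0.0 : float32; topp = 0.9 : float32; rng_seed = 0 : nat64;})'
done
```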

# Fine tuning

When making your own checkpoint via fine-tuning, make sure to train with the correct version of [karpathy/llama2.c](https://github.com/karpathy/llama2.c):
34 changes: 31 additions & 3 deletions icpp_llama2/demo.ps1
@@ -68,9 +68,37 @@ wsl --% dfx canister call llama2 ready
#######################################################################
Write-Host " "
Write-Host "--------------------------------------------------"
Write-Host "Inference call (Generate a story...)"
wsl --% dfx canister call llama2 inference '(record {prompt = "" : text; steps = 19 : nat64; temperature = 0.9 : float32; topp = 0.9 : float32; rng_seed = 0 : nat64;})'
wsl --% dfx canister call llama2 inference '(record {prompt = "Lilly " : text; steps = 19 : nat64; temperature = 0.9 : float32; topp = 0.9 : float32; rng_seed = 0 : nat64;})'
Write-Host "Generate a new story, 10 tokens at a time, starting with an empty prompt."
wsl --% dfx canister call llama2 new_chat '()'
wsl --% dfx canister call llama2 inference '(record {prompt = "" : text; steps = 10 : nat64; temperature = 0.0 : float32; topp = 0.9 : float32; rng_seed = 0 : nat64;})'
wsl --% dfx canister call llama2 inference '(record {prompt = "" : text; steps = 10 : nat64; temperature = 0.0 : float32; topp = 0.9 : float32; rng_seed = 0 : nat64;})'
wsl --% dfx canister call llama2 inference '(record {prompt = "" : text; steps = 10 : nat64; temperature = 0.0 : float32; topp = 0.9 : float32; rng_seed = 0 : nat64;})'
wsl --% dfx canister call llama2 inference '(record {prompt = "" : text; steps = 10 : nat64; temperature = 0.0 : float32; topp = 0.9 : float32; rng_seed = 0 : nat64;})'
wsl --% dfx canister call llama2 inference '(record {prompt = "" : text; steps = 10 : nat64; temperature = 0.0 : float32; topp = 0.9 : float32; rng_seed = 0 : nat64;})'
wsl --% dfx canister call llama2 inference '(record {prompt = "" : text; steps = 10 : nat64; temperature = 0.0 : float32; topp = 0.9 : float32; rng_seed = 0 : nat64;})'
wsl --% dfx canister call llama2 inference '(record {prompt = "" : text; steps = 10 : nat64; temperature = 0.0 : float32; topp = 0.9 : float32; rng_seed = 0 : nat64;})'
wsl --% dfx canister call llama2 inference '(record {prompt = "" : text; steps = 10 : nat64; temperature = 0.0 : float32; topp = 0.9 : float32; rng_seed = 0 : nat64;})'

Write-Host " "
Write-Host "--------------------------------------------------"
Write-Host "Generate a new story, 10 tokens at a time, using a starting prompt"
wsl --% dfx canister call llama2 new_chat '()'
# You can build the prompt in multiple calls
wsl --% dfx canister call llama2 inference '(record {prompt = "Lilly went to" : text; steps = 0 : nat64; temperature = 0.0 : float32; topp = 0.9 : float32; rng_seed = 0 : nat64;})'
wsl --% dfx canister call llama2 inference '(record {prompt = "the beach this morning." : text; steps = 0 : nat64; temperature = 0.0 : float32; topp = 0.9 : float32; rng_seed = 0 : nat64;})'
wsl --% dfx canister call llama2 inference '(record {prompt = "She saw a little boat" : text; steps = 0 : nat64; temperature = 0.0 : float32; topp = 0.9 : float32; rng_seed = 0 : nat64;})'
wsl --% dfx canister call llama2 inference '(record {prompt = "with her friend Billy" : text; steps = 0 : nat64; temperature = 0.0 : float32; topp = 0.9 : float32; rng_seed = 0 : nat64;})'
# Followed by building out the story
wsl --% dfx canister call llama2 inference '(record {prompt = "" : text; steps = 10 : nat64; temperature = 0.0 : float32; topp = 0.9 : float32; rng_seed = 0 : nat64;})'
wsl --% dfx canister call llama2 inference '(record {prompt = "" : text; steps = 10 : nat64; temperature = 0.0 : float32; topp = 0.9 : float32; rng_seed = 0 : nat64;})'
wsl --% dfx canister call llama2 inference '(record {prompt = "" : text; steps = 10 : nat64; temperature = 0.0 : float32; topp = 0.9 : float32; rng_seed = 0 : nat64;})'
wsl --% dfx canister call llama2 inference '(record {prompt = "" : text; steps = 10 : nat64; temperature = 0.0 : float32; topp = 0.9 : float32; rng_seed = 0 : nat64;})'
wsl --% dfx canister call llama2 inference '(record {prompt = "" : text; steps = 10 : nat64; temperature = 0.0 : float32; topp = 0.9 : float32; rng_seed = 0 : nat64;})'
wsl --% dfx canister call llama2 inference '(record {prompt = "" : text; steps = 10 : nat64; temperature = 0.0 : float32; topp = 0.9 : float32; rng_seed = 0 : nat64;})'
wsl --% dfx canister call llama2 inference '(record {prompt = "" : text; steps = 10 : nat64; temperature = 0.0 : float32; topp = 0.9 : float32; rng_seed = 0 : nat64;})'
wsl --% dfx canister call llama2 inference '(record {prompt = "" : text; steps = 10 : nat64; temperature = 0.0 : float32; topp = 0.9 : float32; rng_seed = 0 : nat64;})'
wsl --% dfx canister call llama2 inference '(record {prompt = "" : text; steps = 10 : nat64; temperature = 0.0 : float32; topp = 0.9 : float32; rng_seed = 0 : nat64;})'
wsl --% dfx canister call llama2 inference '(record {prompt = "" : text; steps = 10 : nat64; temperature = 0.0 : float32; topp = 0.9 : float32; rng_seed = 0 : nat64;})'

# #######################################################################
# Write-Host " "
34 changes: 31 additions & 3 deletions icpp_llama2/demo.sh
@@ -47,9 +47,37 @@ dfx canister call llama2 ready
#######################################################################
echo " "
echo "--------------------------------------------------"
echo "Inference call (Generate a story...)"
dfx canister call llama2 inference '(record {prompt = "" : text; steps = 19 : nat64; temperature = 0.9 : float32; topp = 0.9 : float32; rng_seed = 0 : nat64;})'
dfx canister call llama2 inference '(record {prompt = "Lilly " : text; steps = 19 : nat64; temperature = 0.9 : float32; topp = 0.9 : float32; rng_seed = 0 : nat64;})'
echo "Generate a new story, 10 tokens at a time, starting with an empty prompt."
dfx canister call llama2 new_chat '()'
dfx canister call llama2 inference '(record {prompt = "" : text; steps = 10 : nat64; temperature = 0.0 : float32; topp = 0.9 : float32; rng_seed = 0 : nat64;})'
dfx canister call llama2 inference '(record {prompt = "" : text; steps = 10 : nat64; temperature = 0.0 : float32; topp = 0.9 : float32; rng_seed = 0 : nat64;})'
dfx canister call llama2 inference '(record {prompt = "" : text; steps = 10 : nat64; temperature = 0.0 : float32; topp = 0.9 : float32; rng_seed = 0 : nat64;})'
dfx canister call llama2 inference '(record {prompt = "" : text; steps = 10 : nat64; temperature = 0.0 : float32; topp = 0.9 : float32; rng_seed = 0 : nat64;})'
dfx canister call llama2 inference '(record {prompt = "" : text; steps = 10 : nat64; temperature = 0.0 : float32; topp = 0.9 : float32; rng_seed = 0 : nat64;})'
dfx canister call llama2 inference '(record {prompt = "" : text; steps = 10 : nat64; temperature = 0.0 : float32; topp = 0.9 : float32; rng_seed = 0 : nat64;})'
dfx canister call llama2 inference '(record {prompt = "" : text; steps = 10 : nat64; temperature = 0.0 : float32; topp = 0.9 : float32; rng_seed = 0 : nat64;})'
dfx canister call llama2 inference '(record {prompt = "" : text; steps = 10 : nat64; temperature = 0.0 : float32; topp = 0.9 : float32; rng_seed = 0 : nat64;})'

echo " "
echo "--------------------------------------------------"
echo "Generate a new story, 10 tokens at a time, using a starting prompt"
dfx canister call llama2 new_chat '()'
# You can build the prompt in multiple calls
dfx canister call llama2 inference '(record {prompt = "Lilly went to" : text; steps = 0 : nat64; temperature = 0.0 : float32; topp = 0.9 : float32; rng_seed = 0 : nat64;})'
dfx canister call llama2 inference '(record {prompt = "the beach this morning." : text; steps = 0 : nat64; temperature = 0.0 : float32; topp = 0.9 : float32; rng_seed = 0 : nat64;})'
dfx canister call llama2 inference '(record {prompt = "She saw a little boat" : text; steps = 0 : nat64; temperature = 0.0 : float32; topp = 0.9 : float32; rng_seed = 0 : nat64;})'
dfx canister call llama2 inference '(record {prompt = "with her friend Billy" : text; steps = 0 : nat64; temperature = 0.0 : float32; topp = 0.9 : float32; rng_seed = 0 : nat64;})'
# Followed by building out the story
dfx canister call llama2 inference '(record {prompt = "" : text; steps = 10 : nat64; temperature = 0.0 : float32; topp = 0.9 : float32; rng_seed = 0 : nat64;})'
dfx canister call llama2 inference '(record {prompt = "" : text; steps = 10 : nat64; temperature = 0.0 : float32; topp = 0.9 : float32; rng_seed = 0 : nat64;})'
dfx canister call llama2 inference '(record {prompt = "" : text; steps = 10 : nat64; temperature = 0.0 : float32; topp = 0.9 : float32; rng_seed = 0 : nat64;})'
dfx canister call llama2 inference '(record {prompt = "" : text; steps = 10 : nat64; temperature = 0.0 : float32; topp = 0.9 : float32; rng_seed = 0 : nat64;})'
dfx canister call llama2 inference '(record {prompt = "" : text; steps = 10 : nat64; temperature = 0.0 : float32; topp = 0.9 : float32; rng_seed = 0 : nat64;})'
dfx canister call llama2 inference '(record {prompt = "" : text; steps = 10 : nat64; temperature = 0.0 : float32; topp = 0.9 : float32; rng_seed = 0 : nat64;})'
dfx canister call llama2 inference '(record {prompt = "" : text; steps = 10 : nat64; temperature = 0.0 : float32; topp = 0.9 : float32; rng_seed = 0 : nat64;})'
dfx canister call llama2 inference '(record {prompt = "" : text; steps = 10 : nat64; temperature = 0.0 : float32; topp = 0.9 : float32; rng_seed = 0 : nat64;})'
dfx canister call llama2 inference '(record {prompt = "" : text; steps = 10 : nat64; temperature = 0.0 : float32; topp = 0.9 : float32; rng_seed = 0 : nat64;})'
dfx canister call llama2 inference '(record {prompt = "" : text; steps = 10 : nat64; temperature = 0.0 : float32; topp = 0.9 : float32; rng_seed = 0 : nat64;})'
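# Note: with temperature = 0.0 the sampler is greedy (deterministic), so these
# repeated 10-token calls extend a single coherent story reproducibly.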

# #######################################################################
# echo " "
16 changes: 2 additions & 14 deletions icpp_llama2/icpp.toml
@@ -1,20 +1,8 @@
[build-wasm]
canister = "llama2"
did_path = "src/llama2.did"
cpp_paths = [
"src/canister.cpp",
"src/http.cpp",
"src/upload.cpp",
"src/initialize.cpp",
"src/inference.cpp",
]
cpp_header_paths = [
"src/canister.h",
"src/http.h",
"src/upload.h",
"src/initialize.h",
"src/inference.h",
]
cpp_paths = ["src/*.cpp"]
cpp_header_paths = ["src/*.h"]
cpp_compile_flags = []
cpp_link_flags = []
c_paths = ["src/run.c"]