diff --git a/.github/designs/benchmark_dlen1024_vlen1024_fp/t1rocketemu.json b/.github/designs/benchmark_dlen1024_vlen1024_fp/t1rocketemu.json
index 327b004cb..957b028bb 100644
--- a/.github/designs/benchmark_dlen1024_vlen1024_fp/t1rocketemu.json
+++ b/.github/designs/benchmark_dlen1024_vlen1024_fp/t1rocketemu.json
@@ -1,8 +1,8 @@
 {
-  "eval.ntt_64": 1975,
-  "eval.ntt_128": 3573,
-  "eval.ntt_256": 5247,
-  "eval.ntt_512": 58013,
-  "eval.ntt_1024": 57728,
-  "eval.ntt_4096": 56564
+  "eval.ntt_64": 2305,
+  "eval.ntt_128": 4337,
+  "eval.ntt_256": 6100,
+  "eval.ntt_512": 57515,
+  "eval.ntt_1024": 58997,
+  "eval.ntt_4096": 56320
 }
\ No newline at end of file
diff --git a/.github/designs/benchmark_dlen1024_vlen2048_fp/t1rocketemu.json b/.github/designs/benchmark_dlen1024_vlen2048_fp/t1rocketemu.json
index 438652fec..5837d21a1 100644
--- a/.github/designs/benchmark_dlen1024_vlen2048_fp/t1rocketemu.json
+++ b/.github/designs/benchmark_dlen1024_vlen2048_fp/t1rocketemu.json
@@ -1,8 +1,8 @@
 {
-  "eval.ntt_64": 1975,
-  "eval.ntt_128": 3573,
-  "eval.ntt_256": 5229,
-  "eval.ntt_512": 8099,
-  "eval.ntt_1024": 57728,
-  "eval.ntt_4096": 56564
+  "eval.ntt_64": 2305,
+  "eval.ntt_128": 4337,
+  "eval.ntt_256": 6083,
+  "eval.ntt_512": 8057,
+  "eval.ntt_1024": 58997,
+  "eval.ntt_4096": 56320
 }
\ No newline at end of file
diff --git a/.github/designs/benchmark_dlen1024_vlen4096_fp/t1rocketemu.json b/.github/designs/benchmark_dlen1024_vlen4096_fp/t1rocketemu.json
index 83c2e6f7c..bce8a91fc 100644
--- a/.github/designs/benchmark_dlen1024_vlen4096_fp/t1rocketemu.json
+++ b/.github/designs/benchmark_dlen1024_vlen4096_fp/t1rocketemu.json
@@ -1,8 +1,8 @@
 {
-  "eval.ntt_64": 1975,
-  "eval.ntt_128": 3573,
-  "eval.ntt_256": 5229,
-  "eval.ntt_512": 8096,
-  "eval.ntt_1024": 20939,
-  "eval.ntt_4096": 56564
+  "eval.ntt_64": 2305,
+  "eval.ntt_128": 4337,
+  "eval.ntt_256": 6083,
+  "eval.ntt_512": 8057,
+  "eval.ntt_1024": 21131,
+  "eval.ntt_4096": 56320
 }
\ No newline at end of file
diff --git a/.github/designs/benchmark_dlen128_vlen1024_fp/t1rocketemu.json b/.github/designs/benchmark_dlen128_vlen1024_fp/t1rocketemu.json
index 2c265ea46..c74703241 100644
--- a/.github/designs/benchmark_dlen128_vlen1024_fp/t1rocketemu.json
+++ b/.github/designs/benchmark_dlen128_vlen1024_fp/t1rocketemu.json
@@ -1,8 +1,8 @@
 {
-  "eval.ntt_64": 2757,
-  "eval.ntt_128": 4738,
-  "eval.ntt_256": 8777,
-  "eval.ntt_512": 58013,
-  "eval.ntt_1024": 57728,
-  "eval.ntt_4096": 56564
+  "eval.ntt_64": 3087,
+  "eval.ntt_128": 5428,
+  "eval.ntt_256": 9060,
+  "eval.ntt_512": 57515,
+  "eval.ntt_1024": 58997,
+  "eval.ntt_4096": 56320
 }
\ No newline at end of file
diff --git a/.github/designs/benchmark_dlen128_vlen128_fp/t1rocketemu.json b/.github/designs/benchmark_dlen128_vlen128_fp/t1rocketemu.json
index c63a22ccf..423803b5a 100644
--- a/.github/designs/benchmark_dlen128_vlen128_fp/t1rocketemu.json
+++ b/.github/designs/benchmark_dlen128_vlen128_fp/t1rocketemu.json
@@ -1,8 +1,8 @@
 {
-  "eval.ntt_64": 58036,
-  "eval.ntt_128": 58871,
-  "eval.ntt_256": 58884,
-  "eval.ntt_512": 58013,
-  "eval.ntt_1024": 57728,
-  "eval.ntt_4096": 56564
+  "eval.ntt_64": 57783,
+  "eval.ntt_128": 57779,
+  "eval.ntt_256": 57922,
+  "eval.ntt_512": 57515,
+  "eval.ntt_1024": 58997,
+  "eval.ntt_4096": 56320
 }
\ No newline at end of file
diff --git a/.github/designs/benchmark_dlen128_vlen2048_fp/t1rocketemu.json b/.github/designs/benchmark_dlen128_vlen2048_fp/t1rocketemu.json
index 39d13ae70..4851f4241 100644
--- a/.github/designs/benchmark_dlen128_vlen2048_fp/t1rocketemu.json
+++ b/.github/designs/benchmark_dlen128_vlen2048_fp/t1rocketemu.json
@@ -1,8 +1,8 @@
 {
-  "eval.ntt_64": 2757,
-  "eval.ntt_128": 4734,
-  "eval.ntt_256": 8721,
-  "eval.ntt_512": 18629,
-  "eval.ntt_1024": 57728,
-  "eval.ntt_4096": 56564
+  "eval.ntt_64": 3087,
+  "eval.ntt_128": 5428,
+  "eval.ntt_256": 9560,
+  "eval.ntt_512": 18674,
+  "eval.ntt_1024": 58997,
+  "eval.ntt_4096": 56320
 }
\ No newline at end of file
diff --git a/.github/designs/benchmark_dlen128_vlen256_fp/t1rocketemu.json b/.github/designs/benchmark_dlen128_vlen256_fp/t1rocketemu.json
index ce07a3e7f..700353d5d 100644
--- a/.github/designs/benchmark_dlen128_vlen256_fp/t1rocketemu.json
+++ b/.github/designs/benchmark_dlen128_vlen256_fp/t1rocketemu.json
@@ -1,8 +1,8 @@
 {
-  "eval.ntt_64": 2757,
-  "eval.ntt_128": 58871,
-  "eval.ntt_256": 58884,
-  "eval.ntt_512": 58013,
-  "eval.ntt_1024": 57728,
-  "eval.ntt_4096": 56564
+  "eval.ntt_64": 3087,
+  "eval.ntt_128": 57779,
+  "eval.ntt_256": 57922,
+  "eval.ntt_512": 57515,
+  "eval.ntt_1024": 58997,
+  "eval.ntt_4096": 56320
 }
\ No newline at end of file
diff --git a/.github/designs/benchmark_dlen128_vlen4096_fp/t1rocketemu.json b/.github/designs/benchmark_dlen128_vlen4096_fp/t1rocketemu.json
index 3c42c0a5c..d04a9522a 100644
--- a/.github/designs/benchmark_dlen128_vlen4096_fp/t1rocketemu.json
+++ b/.github/designs/benchmark_dlen128_vlen4096_fp/t1rocketemu.json
@@ -1,8 +1,8 @@
 {
-  "eval.ntt_64": 2757,
-  "eval.ntt_128": 4734,
-  "eval.ntt_256": 8736,
-  "eval.ntt_512": 18627,
-  "eval.ntt_1024": 43035,
-  "eval.ntt_4096": 56564
+  "eval.ntt_64": 3087,
+  "eval.ntt_128": 5428,
+  "eval.ntt_256": 9560,
+  "eval.ntt_512": 18673,
+  "eval.ntt_1024": 43078,
+  "eval.ntt_4096": 56320
 }
\ No newline at end of file
diff --git a/.github/designs/benchmark_dlen128_vlen512_fp/t1rocketemu.json b/.github/designs/benchmark_dlen128_vlen512_fp/t1rocketemu.json
index 43fe157ce..4a30f6553 100644
--- a/.github/designs/benchmark_dlen128_vlen512_fp/t1rocketemu.json
+++ b/.github/designs/benchmark_dlen128_vlen512_fp/t1rocketemu.json
@@ -1,8 +1,8 @@
 {
-  "eval.ntt_64": 2757,
-  "eval.ntt_128": 4722,
-  "eval.ntt_256": 58884,
-  "eval.ntt_512": 58013,
-  "eval.ntt_1024": 57728,
-  "eval.ntt_4096": 56564
+  "eval.ntt_64": 3087,
+  "eval.ntt_128": 5429,
+  "eval.ntt_256": 57922,
+  "eval.ntt_512": 57515,
+  "eval.ntt_1024": 58997,
+  "eval.ntt_4096": 56320
 }
\ No newline at end of file
diff --git a/.github/designs/benchmark_dlen256_vlen1024_fp/t1rocketemu.json b/.github/designs/benchmark_dlen256_vlen1024_fp/t1rocketemu.json
index 8d19b941b..b30e4d553 100644
--- a/.github/designs/benchmark_dlen256_vlen1024_fp/t1rocketemu.json
+++ b/.github/designs/benchmark_dlen256_vlen1024_fp/t1rocketemu.json
@@ -1,8 +1,8 @@
 {
-  "eval.ntt_64": 2192,
-  "eval.ntt_128": 3469,
-  "eval.ntt_256": 6153,
-  "eval.ntt_512": 58013,
-  "eval.ntt_1024": 57728,
-  "eval.ntt_4096": 56564
+  "eval.ntt_64": 2522,
+  "eval.ntt_128": 4082,
+  "eval.ntt_256": 6218,
+  "eval.ntt_512": 57515,
+  "eval.ntt_1024": 58997,
+  "eval.ntt_4096": 56320
 }
\ No newline at end of file
diff --git a/.github/designs/benchmark_dlen256_vlen2048_fp/t1rocketemu.json b/.github/designs/benchmark_dlen256_vlen2048_fp/t1rocketemu.json
index 2a0fa5e9c..7674364c8 100644
--- a/.github/designs/benchmark_dlen256_vlen2048_fp/t1rocketemu.json
+++ b/.github/designs/benchmark_dlen256_vlen2048_fp/t1rocketemu.json
@@ -1,8 +1,8 @@
 {
-  "eval.ntt_64": 2192,
-  "eval.ntt_128": 3469,
-  "eval.ntt_256": 6153,
-  "eval.ntt_512": 10429,
-  "eval.ntt_1024": 57728,
-  "eval.ntt_4096": 56564
+  "eval.ntt_64": 2522,
+  "eval.ntt_128": 4082,
+  "eval.ntt_256": 6218,
+  "eval.ntt_512": 10473,
+  "eval.ntt_1024": 58997,
+  "eval.ntt_4096": 56320
 }
\ No newline at end of file
diff --git a/.github/designs/benchmark_dlen256_vlen256_fp/t1rocketemu.json b/.github/designs/benchmark_dlen256_vlen256_fp/t1rocketemu.json
index 6233f5bd6..0f7057718 100644
--- a/.github/designs/benchmark_dlen256_vlen256_fp/t1rocketemu.json
+++ b/.github/designs/benchmark_dlen256_vlen256_fp/t1rocketemu.json
@@ -1,8 +1,8 @@
 {
-  "eval.ntt_64": 2203,
-  "eval.ntt_128": 58871,
-  "eval.ntt_256": 58884,
-  "eval.ntt_512": 58013,
-  "eval.ntt_1024": 57728,
-  "eval.ntt_4096": 56564
+  "eval.ntt_64": 2533,
+  "eval.ntt_128": 57779,
+  "eval.ntt_256": 57922,
+  "eval.ntt_512": 57515,
+  "eval.ntt_1024": 58997,
+  "eval.ntt_4096": 56320
 }
\ No newline at end of file
diff --git a/.github/designs/benchmark_dlen256_vlen4096_fp/t1rocketemu.json b/.github/designs/benchmark_dlen256_vlen4096_fp/t1rocketemu.json
index abd09b0d3..ee6ec7105 100644
--- a/.github/designs/benchmark_dlen256_vlen4096_fp/t1rocketemu.json
+++ b/.github/designs/benchmark_dlen256_vlen4096_fp/t1rocketemu.json
@@ -1,8 +1,8 @@
 {
-  "eval.ntt_64": 2192,
-  "eval.ntt_128": 3469,
-  "eval.ntt_256": 6153,
-  "eval.ntt_512": 10852,
-  "eval.ntt_1024": 26612,
-  "eval.ntt_4096": 56564
+  "eval.ntt_64": 2522,
+  "eval.ntt_128": 4082,
+  "eval.ntt_256": 6218,
+  "eval.ntt_512": 10484,
+  "eval.ntt_1024": 26525,
+  "eval.ntt_4096": 56320
 }
\ No newline at end of file
diff --git a/.github/designs/benchmark_dlen256_vlen512_fp/t1rocketemu.json b/.github/designs/benchmark_dlen256_vlen512_fp/t1rocketemu.json
index bfa444787..60ecc84b1 100644
--- a/.github/designs/benchmark_dlen256_vlen512_fp/t1rocketemu.json
+++ b/.github/designs/benchmark_dlen256_vlen512_fp/t1rocketemu.json
@@ -1,8 +1,8 @@
 {
-  "eval.ntt_64": 2196,
-  "eval.ntt_128": 3335,
-  "eval.ntt_256": 58884,
-  "eval.ntt_512": 58013,
-  "eval.ntt_1024": 57728,
-  "eval.ntt_4096": 56564
+  "eval.ntt_64": 2527,
+  "eval.ntt_128": 4093,
+  "eval.ntt_256": 57922,
+  "eval.ntt_512": 57515,
+  "eval.ntt_1024": 58997,
+  "eval.ntt_4096": 56320
 }
\ No newline at end of file
diff --git a/.github/designs/benchmark_dlen512_vlen1024_fp/t1rocketemu.json b/.github/designs/benchmark_dlen512_vlen1024_fp/t1rocketemu.json
index aa53562bc..734b209e3 100644
--- a/.github/designs/benchmark_dlen512_vlen1024_fp/t1rocketemu.json
+++ b/.github/designs/benchmark_dlen512_vlen1024_fp/t1rocketemu.json
@@ -1,8 +1,8 @@
 {
-  "eval.ntt_64": 2012,
-  "eval.ntt_128": 3278,
-  "eval.ntt_256": 5090,
-  "eval.ntt_512": 58013,
-  "eval.ntt_1024": 57728,
-  "eval.ntt_4096": 56564
+  "eval.ntt_64": 2340,
+  "eval.ntt_128": 3717,
+  "eval.ntt_256": 6119,
+  "eval.ntt_512": 57515,
+  "eval.ntt_1024": 58997,
+  "eval.ntt_4096": 56320
 }
\ No newline at end of file
diff --git a/.github/designs/benchmark_dlen512_vlen2048_fp/t1rocketemu.json b/.github/designs/benchmark_dlen512_vlen2048_fp/t1rocketemu.json
index 7946f0d8b..e9faa7eb2 100644
--- a/.github/designs/benchmark_dlen512_vlen2048_fp/t1rocketemu.json
+++ b/.github/designs/benchmark_dlen512_vlen2048_fp/t1rocketemu.json
@@ -1,8 +1,8 @@
 {
-  "eval.ntt_64": 2012,
-  "eval.ntt_128": 3276,
-  "eval.ntt_256": 5090,
-  "eval.ntt_512": 9059,
-  "eval.ntt_1024": 57728,
-  "eval.ntt_4096": 56564
+  "eval.ntt_64": 2340,
+  "eval.ntt_128": 3717,
+  "eval.ntt_256": 6119,
+  "eval.ntt_512": 9228,
+  "eval.ntt_1024": 58997,
+  "eval.ntt_4096": 56320
 }
\ No newline at end of file
diff --git a/.github/designs/benchmark_dlen512_vlen4096_fp/t1rocketemu.json b/.github/designs/benchmark_dlen512_vlen4096_fp/t1rocketemu.json
index f4bbdd2f9..0a841927a 100644
--- a/.github/designs/benchmark_dlen512_vlen4096_fp/t1rocketemu.json
+++ b/.github/designs/benchmark_dlen512_vlen4096_fp/t1rocketemu.json
@@ -1,8 +1,8 @@
 {
-  "eval.ntt_64": 2012,
-  "eval.ntt_128": 3276,
-  "eval.ntt_256": 5090,
-  "eval.ntt_512": 9059,
-  "eval.ntt_1024": 22351,
-  "eval.ntt_4096": 56564
+  "eval.ntt_64": 2340,
+  "eval.ntt_128": 3717,
+  "eval.ntt_256": 6119,
+  "eval.ntt_512": 9223,
+  "eval.ntt_1024": 22370,
+  "eval.ntt_4096": 56320
 }
\ No newline at end of file
diff --git a/.github/designs/benchmark_dlen512_vlen512_fp/t1rocketemu.json b/.github/designs/benchmark_dlen512_vlen512_fp/t1rocketemu.json
index 914a533bb..255a1686d 100644
--- a/.github/designs/benchmark_dlen512_vlen512_fp/t1rocketemu.json
+++ b/.github/designs/benchmark_dlen512_vlen512_fp/t1rocketemu.json
@@ -1,8 +1,8 @@
 {
-  "eval.ntt_64": 2012,
-  "eval.ntt_128": 3287,
-  "eval.ntt_256": 58884,
-  "eval.ntt_512": 58013,
-  "eval.ntt_1024": 57728,
-  "eval.ntt_4096": 56564
+  "eval.ntt_64": 2340,
+  "eval.ntt_128": 3724,
+  "eval.ntt_256": 57922,
+  "eval.ntt_512": 57515,
+  "eval.ntt_1024": 58997,
+  "eval.ntt_4096": 56320
 }
\ No newline at end of file
diff --git a/.github/designs/blastoise/t1rocketemu.json b/.github/designs/blastoise/t1rocketemu.json
index 18b57abe5..e6917fc34 100644
--- a/.github/designs/blastoise/t1rocketemu.json
+++ b/.github/designs/blastoise/t1rocketemu.json
@@ -1,9 +1,9 @@
 {
-  "asm.memcpy": 1813,
-  "asm.mmm": 52503,
-  "asm.smoke": 8556,
-  "asm.strlen": 14192,
-  "asm.utf8_count": 505,
+  "asm.memcpy": 2260,
+  "asm.mmm": 52837,
+  "asm.smoke": 9061,
+  "asm.strlen": 14589,
+  "asm.utf8_count": 859,
   "codegen.vaadd_vv": 509657,
   "codegen.vaadd_vx": 1082497,
   "codegen.vaaddu_vv": 509657,
@@ -499,30 +499,30 @@
   "codegen.vxor_vx": 272297,
   "codegen.vzext_vf2": 35727,
   "codegen.vzext_vf4": 9179,
-  "intrinsic.conv2d_less_m2": 4359,
-  "intrinsic.linear_normalization": 4818,
+  "intrinsic.conv2d_less_m2": 4684,
+  "intrinsic.linear_normalization": 5505,
   "intrinsic.matmul": 156950,
-  "intrinsic.softmax": 9003,
-  "mlir.axpy_masked": 11636,
-  "mlir.conv": 298996,
-  "mlir.hello": 371,
-  "mlir.matmul": 75518,
-  "mlir.maxvl_tail_setvl_front": 2375,
-  "mlir.rvv_vp_intrinsic_add": 774,
-  "mlir.rvv_vp_intrinsic_add_scalable": 1173,
-  "mlir.stripmining": 35059,
-  "mlir.vectoradd": 72377,
-  "pytorch.demo": 122735,
-  "pytorch.matmul": 109796,
-  "rvv_bench.ascii_to_utf16": 1383013,
-  "rvv_bench.ascii_to_utf32": 440566,
-  "rvv_bench.byteswap": 449395,
-  "rvv_bench.chacha20": 45624,
-  "rvv_bench.mandelbrot": 563959,
-  "rvv_bench.memcpy": 1267552,
-  "rvv_bench.memset": 510029,
-  "rvv_bench.mergelines": 777753,
-  "rvv_bench.poly1305": 45624,
-  "rvv_bench.strlen": 494533,
-  "rvv_bench.utf8_count": 2659715
+  "intrinsic.softmax": 9329,
+  "mlir.axpy_masked": 11962,
+  "mlir.conv": 299284,
+  "mlir.hello": 694,
+  "mlir.matmul": 75963,
+  "mlir.maxvl_tail_setvl_front": 2762,
+  "mlir.rvv_vp_intrinsic_add": 1127,
+  "mlir.rvv_vp_intrinsic_add_scalable": 1539,
+  "mlir.stripmining": 35418,
+  "mlir.vectoradd": 72384,
+  "pytorch.demo": 123874,
+  "pytorch.matmul": 109914,
+  "rvv_bench.ascii_to_utf16": 1383170,
+  "rvv_bench.ascii_to_utf32": 440610,
+  "rvv_bench.byteswap": 449384,
+  "rvv_bench.chacha20": 45889,
+  "rvv_bench.mandelbrot": 564338,
+  "rvv_bench.memcpy": 1268232,
+  "rvv_bench.memset": 509712,
+  "rvv_bench.mergelines": 770459,
+  "rvv_bench.poly1305": 45889,
+  "rvv_bench.strlen": 495094,
+  "rvv_bench.utf8_count": 2669518
 }
\ No newline at end of file
diff --git a/.github/designs/rookidee/t1rocketemu.json b/.github/designs/rookidee/t1rocketemu.json
index 7e8005b9b..9d6b93797 100644
--- a/.github/designs/rookidee/t1rocketemu.json
+++ b/.github/designs/rookidee/t1rocketemu.json
@@ -1,6 +1,6 @@
 {
-  "asm.mmm": 57732,
-  "asm.smoke": 8647,
+  "asm.mmm": 58062,
+  "asm.smoke": 9136,
   "codegen.vaadd_vv": 306399,
   "codegen.vaadd_vx": 819226,
   "codegen.vaaddu_vv": 306399,
@@ -430,19 +430,19 @@
   "codegen.vxor_vx": 214202,
   "codegen.vzext_vf2": 29723,
   "codegen.vzext_vf4": 11289,
-  "intrinsic.conv2d_less_m2": 4410,
-  "mlir.hello": 369,
-  "mlir.rvv_vp_intrinsic_add": 762,
-  "mlir.rvv_vp_intrinsic_add_scalable": 1062,
-  "mlir.stripmining": 58612,
-  "rvv_bench.ascii_to_utf16": 1385284,
-  "rvv_bench.ascii_to_utf32": 431848,
-  "rvv_bench.byteswap": 489626,
-  "rvv_bench.chacha20": 45624,
-  "rvv_bench.memcpy": 1285481,
-  "rvv_bench.memset": 509966,
-  "rvv_bench.mergelines": 795847,
-  "rvv_bench.poly1305": 45624,
-  "rvv_bench.strlen": 529066,
-  "rvv_bench.utf8_count": 2753737
+  "intrinsic.conv2d_less_m2": 4728,
+  "mlir.hello": 692,
+  "mlir.rvv_vp_intrinsic_add": 1122,
+  "mlir.rvv_vp_intrinsic_add_scalable": 1428,
+  "mlir.stripmining": 58928,
+  "rvv_bench.ascii_to_utf16": 1386186,
+  "rvv_bench.ascii_to_utf32": 431879,
+  "rvv_bench.byteswap": 489489,
+  "rvv_bench.chacha20": 45889,
+  "rvv_bench.memcpy": 1285677,
+  "rvv_bench.memset": 510014,
+  "rvv_bench.mergelines": 790052,
+  "rvv_bench.poly1305": 45889,
+  "rvv_bench.strlen": 530552,
+  "rvv_bench.utf8_count": 2762239
 }
\ No newline at end of file
diff --git a/nix/overlay.nix b/nix/overlay.nix
index 7bfe75237..4a3b3fe54 100644
--- a/nix/overlay.nix
+++ b/nix/overlay.nix
@@ -151,4 +151,30 @@ rec {
   };
 
   t1 = final.callPackage ./t1 { };
+
+  buddy-codegen = final.callPackage ./pkgs/buddy-codegen { };
+
+  spike-t1 = final.writeShellApplication {
+    name = "spike-t1";
+
+    runtimeInputs = with final; [
+      pkgsCross.riscv32-embedded.buildPackages.gcc
+      spike
+      dtc
+    ];
+
+    text = ''
+      elf=''${1:-}
+      if [[ -z "$elf" ]]; then
+        echo "Require argument to find elf" >&2
+        exit 1
+      fi
+
+      spike -d --isa=rv32gcv_zvl2048b_zve32f \
+        --priv=m \
+        -m0x20000000:0x20000000,0x00000000:0x20000000,0x40000000:0x80000000,0xc0000000:0x40000000 \
+        --pc=0x"$(grep '<_start>' <(riscv32-none-elf-objdump -d "$elf") | cut -d' ' -f1)" \
+        "$elf"
+    '';
+  };
 }
diff --git a/nix/patches/buddy-mlir/00-fix-splat-op.patch b/nix/patches/buddy-mlir/00-fix-splat-op.patch
new file mode 100644
index 000000000..3fd32476c
--- /dev/null
+++ b/nix/patches/buddy-mlir/00-fix-splat-op.patch
@@ -0,0 +1,59 @@
+--- a/frontend/Python/ops/linalg.py	1970-01-01 08:00:01.000000000 +0800
++++ b/frontend/Python/ops/linalg.py	1970-01-01 08:00:01.000000000 +0800
+@@ -1166,7 +1166,7 @@
+     element = mlir_element_attr_get(dtype, 0.0)
+     attr = ir.DenseElementsAttr.get_splat(tensor_type, element)
+     matmul_result_buffer = arith.ConstantOp(tensor_type, attr).result
+-    op = linalg.matmul(input1, input2, outs=[matmul_result_buffer])
++    op = linalg.matmul(input1, input2, outputs=[matmul_result_buffer])
+     return op
+ 
+ 
+@@ -1186,7 +1186,7 @@
+     element = mlir_element_attr_get(dtype, 0.0)
+     attr = ir.DenseElementsAttr.get_splat(tensor_type, element)
+     result_buffer = arith.ConstantOp(tensor_type, attr).result
+-    op = linalg.matmul_transpose_b(input1, input2, outs=[result_buffer])
++    op = linalg.matmul_transpose_b(input1, input2, outputs=[result_buffer])
+     return op
+ 
+ 
+@@ -1854,9 +1854,9 @@
+     output = tensor.EmptyOp(output_shape, mlir_dtype)
+ 
+     if not isinstance(input2.type, ir.RankedTensorType):
+-        input2 = tensor.SplatOp(tensor_type, input2).result
++        input2 = tensor.SplatOp(tensor_type, input2, []).result
+     if not isinstance(input3.type, ir.RankedTensorType):
+-        input3 = tensor.SplatOp(tensor_type, input3).result
++        input3 = tensor.SplatOp(tensor_type, input3, []).result
+ 
+     generic_map = ir.AffineMap.get_permutation(
+         [i for i in range(len(output_shape))]
+@@ -2038,7 +2038,7 @@
+     input_shape = ir.RankedTensorType(input_tensor.type).shape
+     tensor_type = ir.RankedTensorType.get(input_shape, input_dtype)
+     scalar = arith.ConstantOp(input_dtype, node.args[1])
+-    rhs = tensor.SplatOp(tensor_type, scalar)
++    rhs = tensor.SplatOp(tensor_type, scalar, [])
+     if str(input_dtype).find("i") != -1:
+         cmp_op = arith.CmpIOp(4, input_tensor, rhs)
+     else:
+@@ -2069,7 +2069,7 @@
+     tensor_type = ir.RankedTensorType.get(input_shape, input_dtype)
+ 
+     scalar = arith.ConstantOp(input_dtype, node.args[1])
+-    rhs = tensor.SplatOp(tensor_type, scalar)
++    rhs = tensor.SplatOp(tensor_type, scalar, [])
+ 
+     if str(input_dtype).find("i") != -1:
+         cmp_op = arith.CmpIOp(5, input_tensor, rhs)
+@@ -2390,7 +2390,7 @@
+         scalar = arith.ConstantOp(input_dtype, float(node.args[1]))
+     else:
+         scalar = arith.ConstantOp(input_dtype, node.args[1])
+-    rhs = tensor.SplatOp(tensor_type, scalar)
++    rhs = tensor.SplatOp(tensor_type, scalar, [])
+     if str(input_dtype).find("i") != -1:
+         cmp_op = arith.CmpIOp(0, input_tensor, rhs)
+     else:
diff --git a/nix/patches/llvm/fix-vector-convert.patch b/nix/patches/llvm/fix-vector-convert.patch
new file mode 100644
index 000000000..9be7dc4a6
--- /dev/null
+++ b/nix/patches/llvm/fix-vector-convert.patch
@@ -0,0 +1,100 @@
+--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp	1970-01-01 00:00:01.000000000 +0000
++++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp	2025-04-08 19:56:05.027381997 +0000
+@@ -4512,7 +4512,8 @@
+ 
+   // Src needs to have twice the number of elements.
+   unsigned NumElts = VT.getVectorNumElements();
+-  if (Src.getValueType().getVectorNumElements() != (NumElts * 2))
++  if (!Src.getValueType().isFixedLengthVector() ||
++      Src.getValueType().getVectorNumElements() != (NumElts * 2))
+     return SDValue();
+ 
+   // The extracts must extract the two halves of the source.
+--- a/mlir/include/mlir/Conversion/Passes.td	1970-01-01 00:00:01.000000000 +0000
++++ b/mlir/include/mlir/Conversion/Passes.td	2025-04-08 18:58:09.826017766 +0000
+@@ -1444,6 +1444,9 @@
+            "vector::VectorTransformsOptions",
+            /*default=*/"vector::VectorTransformsOptions()",
+            "Options to lower some operations like contractions and transposes.">,
++    Option<"indexBitwidth", "index-bitwidth", "unsigned",
++           /*default=kDeriveIndexBitwidthFromDataLayout*/"0",
++           "Bitwidth of the index type, 0 to use size of machine word">,
+   ];
+ }
+ 
+--- a/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp	1970-01-01 00:00:01.000000000 +0000
++++ b/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp	2025-04-08 18:59:47.622433399 +0000
+@@ -1438,8 +1438,6 @@
+     if (llvm::any_of(*targetStrides, ShapedType::isDynamic))
+       return failure();
+ 
+-    auto int64Ty = IntegerType::get(rewriter.getContext(), 64);
+-
+     // Create descriptor.
+     auto desc = MemRefDescriptor::undef(rewriter, loc, llvmTargetDescriptorTy);
+     // Set allocated ptr.
+@@ -1450,21 +1448,23 @@
+     Value ptr = sourceMemRef.alignedPtr(rewriter, loc);
+     desc.setAlignedPtr(rewriter, loc, ptr);
+     // Fill offset 0.
+-    auto attr = rewriter.getIntegerAttr(rewriter.getIndexType(), 0);
+-    auto zero = rewriter.create<LLVM::ConstantOp>(loc, int64Ty, attr);
++    auto idxType = rewriter.getIndexType();
++    auto zero = rewriter.create<LLVM::ConstantOp>(
++        loc, typeConverter->convertType(idxType),
++        rewriter.getIntegerAttr(idxType, 0));
+     desc.setOffset(rewriter, loc, zero);
+ 
+     // Fill size and stride descriptors in memref.
+     for (const auto &indexedSize :
+          llvm::enumerate(targetMemRefType.getShape())) {
+       int64_t index = indexedSize.index();
+-      auto sizeAttr =
+-          rewriter.getIntegerAttr(rewriter.getIndexType(), indexedSize.value());
+-      auto size = rewriter.create<LLVM::ConstantOp>(loc, int64Ty, sizeAttr);
++      auto size = rewriter.create<LLVM::ConstantOp>(
++          loc, typeConverter->convertType(idxType),
++          rewriter.getIntegerAttr(idxType, indexedSize.value()));
+       desc.setSize(rewriter, loc, index, size);
+-      auto strideAttr = rewriter.getIntegerAttr(rewriter.getIndexType(),
+-                                                (*targetStrides)[index]);
+-      auto stride = rewriter.create<LLVM::ConstantOp>(loc, int64Ty, strideAttr);
++      auto stride = rewriter.create<LLVM::ConstantOp>(
++          loc, typeConverter->convertType(idxType),
++          rewriter.getIntegerAttr(idxType, (*targetStrides)[index]));
+       desc.setStride(rewriter, loc, index, stride);
+     }
+ 
+--- a/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVMPass.cpp	1970-01-01 00:00:01.000000000 +0000
++++ b/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVMPass.cpp	2025-04-08 19:00:55.000728446 +0000
+@@ -8,6 +8,7 @@
+ 
+ #include "mlir/Conversion/VectorToLLVM/ConvertVectorToLLVMPass.h"
+ 
++#include "mlir/Analysis/DataLayoutAnalysis.h"
+ #include "mlir/Conversion/LLVMCommon/ConversionTarget.h"
+ #include "mlir/Conversion/LLVMCommon/TypeConverter.h"
+ #include "mlir/Dialect/AMX/AMXDialect.h"
+@@ -64,6 +65,8 @@
+   // Perform progressive lowering of operations on slices and all contraction
+   // operations. Also materializes masks, lowers vector.step, rank-reduces FMA,
+   // applies folding and DCE.
++  Operation *op = getOperation();
++  const auto &dataLayoutAnalysis = getAnalysis<DataLayoutAnalysis>();
+   {
+     RewritePatternSet patterns(&getContext());
+     populateVectorToVectorCanonicalizationPatterns(patterns);
+@@ -85,8 +88,11 @@
+   }
+ 
+   // Convert to the LLVM IR dialect.
+-  LowerToLLVMOptions options(&getContext());
+-  LLVMTypeConverter converter(&getContext(), options);
++  LowerToLLVMOptions options(&getContext(),
++                             dataLayoutAnalysis.getAtOrAbove(op));
++  if (indexBitwidth != kDeriveIndexBitwidthFromDataLayout)
++    options.overrideIndexBitwidth(indexBitwidth);
++  LLVMTypeConverter converter(&getContext(), options, &dataLayoutAnalysis);
+   RewritePatternSet patterns(&getContext());
+   populateVectorTransferLoweringPatterns(patterns);
+   populateVectorToLLVMMatrixConversionPatterns(converter, patterns);
diff --git a/nix/pkgs/buddy-codegen/buddy-codegen.cc b/nix/pkgs/buddy-codegen/buddy-codegen.cc
new file mode 100644
index 000000000..c140f81b1
--- /dev/null
+++ b/nix/pkgs/buddy-codegen/buddy-codegen.cc
@@ -0,0 +1,128 @@
+#include <argparse/argparse.hpp>
+#include <buddy/Core/Container.h>
+#include <buddy/DIP/DIP.h>
+#include <buddy/DIP/ImgContainer.h>
+#include <filesystem>
+#include <fstream>
+#include <iomanip> // For precision control
+#include <sstream>
+#include <string>
+
+template <typename T>
+std::string generate_c_array_code(T *data, size_t size,
+                                  const std::string &declare,
+                                  const std::string &array_name) {
+  std::ostringstream oss;
+  oss << declare << " " << array_name << "[" << size << "] = {\n    ";
+  for (size_t i = 0; i < size; ++i) {
+    oss << std::fixed << std::setprecision(6) << data[i];
+    if (i != size - 1)
+      oss << ", ";
+    if ((i + 1) % 10 == 0)
+      oss << "\n    ";
+  }
+
+  oss << "\n};\n";
+  return oss.str();
+}
+
+int main(int argc, char *argv[]) {
+  argparse::ArgumentParser program("buddy-codegen");
+
+  argparse::ArgumentParser img_cmd("img");
+  img_cmd.add_description("Convert image to MLIR memref C code");
+  img_cmd.add_argument("-i", "--input")
+      .required()
+      .help("specify the input file");
+  img_cmd.add_argument("-o", "--output")
+      .required()
+      .help("specify the output file.");
+  // TODO: support other format
+  img_cmd.add_argument("-m", "--image-mode").default_value("rgb");
+
+  argparse::ArgumentParser arg_cmd("arg");
+  arg_cmd.add_description("Convert PyTorch parameter to MLIR memref C code");
+  arg_cmd.add_argument("-i", "--input")
+      .required()
+      .help("specify the input file");
+  arg_cmd.add_argument("-o", "--output")
+      .required()
+      .help("specify the output file.");
+  arg_cmd.add_argument("-s", "--size")
+      .scan<'d', size_t>()
+      .required()
+      .help("specify the parameter size.");
+
+  program.add_subparser(img_cmd);
+  program.add_subparser(arg_cmd);
+
+  try {
+    program.parse_args(argc, argv);
+  } catch (const std::exception &err) {
+    std::cerr << err.what() << std::endl;
+    std::cerr << program;
+    std::exit(1);
+  }
+
+  if (program.is_subcommand_used(img_cmd)) {
+    std::string imgPath = img_cmd.get("input");
+    std::ofstream outPath;
+    outPath.open(img_cmd.get("output"));
+
+    std::cout << "[buddy-codegen] Loading image..." << std::endl;
+    dip::Image<float, 4> input(imgPath, dip::DIP_RGB, true);
+    // TODO read the col, row using PNG lib
+    MemRef<float, 4> inputResize = dip::Resize4D_NCHW(
+        &input, dip::INTERPOLATION_TYPE::BILINEAR_INTERPOLATION,
+        {1, 3, 224, 224} /*{image_cols, image_rows}*/);
+    auto sizes = inputResize.getSizes();
+    outPath << "#include <cstdint>" << std::endl;
+    outPath << generate_c_array_code(sizes, 4, "static const int32_t",
+                                     "IMAGE_SIZES")
+            << std::endl;
+    outPath << generate_c_array_code(
+                   inputResize.getData(), inputResize.getSize(),
+                   "__attribute((section(\".vdata\"))) float", "IMAGE")
+            << std::endl;
+    outPath.close();
+    std::cout << "[buddy-codegen] code generated" << std::endl;
+  } else if (program.is_subcommand_used(arg_cmd)) {
+    std::string argPath = arg_cmd.get("input");
+    std::ofstream outPath;
+    outPath.open(arg_cmd.get("output"));
+    auto params_size = arg_cmd.get<size_t>("size");
+
+    MemRef<float, 1> params({params_size});
+
+    const auto loadStart = std::chrono::high_resolution_clock::now();
+    // Open the parameter file in binary mode.
+    std::ifstream paramFile(argPath, std::ios::in | std::ios::binary);
+    if (!paramFile.is_open()) {
+      throw std::runtime_error("[Error] Failed to open params file!");
+    }
+    std::cout << "[buddy-codegen] Loading params..." << std::endl;
+    // Read the parameter data into the provided memory reference.
+    paramFile.read(reinterpret_cast<char *>(params.getData()),
+                   sizeof(float) * (params.getSize()));
+    if (paramFile.fail()) {
+      throw std::runtime_error("Error occurred while reading params file!");
+    }
+    paramFile.close();
+
+    outPath << "#include <cstdint>" << std::endl;
+    outPath << generate_c_array_code(params.getSizes(), 1,
+                                     "static const int32_t", "PARAMS_SIZES")
+            << std::endl;
+    outPath << generate_c_array_code(params.getData(), params.getSize(),
+                                     "__attribute((section(\".vdata\"))) float",
+                                     "PARAMS")
+            << std::endl;
+    outPath.close();
+
+    const auto loadEnd = std::chrono::high_resolution_clock::now();
+    const std::chrono::duration<double, std::milli> loadTime =
+        loadEnd - loadStart;
+    std::cout << "[buddy-codegen] Params load time: "
+              << (double)(loadTime.count()) / 1000 << "s" << std::endl;
+  }
+}
diff --git a/nix/pkgs/buddy-codegen/default.nix b/nix/pkgs/buddy-codegen/default.nix
new file mode 100644
index 000000000..caca78327
--- /dev/null
+++ b/nix/pkgs/buddy-codegen/default.nix
@@ -0,0 +1,63 @@
+{
+  lib,
+  stdenv,
+  argparse,
+  buddy-mlir,
+  libpng,
+}:
+stdenv.mkDerivation {
+  name = "buddy-codegen";
+
+  src =
+    with lib.fileset;
+    toSource {
+      fileset = unions [
+        ./dip.mlir
+        ./buddy-codegen.cc
+      ];
+      root = ./.;
+    };
+
+  buildInputs = [
+    libpng
+    argparse
+    buddy-mlir
+  ];
+
+  env.NIX_CFLAGS_COMPILE = toString [
+    # TODO: BMP is now broken
+    "-lpng"
+    "-DBUDDY_ENABLE_PNG"
+    "-O3"
+  ];
+
+  buildPhase = ''
+    runHook preBuild
+    # We don't need to care about stripmining size here
+    buddy-opt dip.mlir \
+      -lower-dip="DIP-strip-mining=256" \
+      -arith-expand \
+      -lower-affine \
+      -llvm-request-c-wrappers \
+      -convert-scf-to-cf \
+      -convert-math-to-llvm \
+      -convert-vector-to-llvm \
+      -finalize-memref-to-llvm \
+      -convert-func-to-llvm \
+      -reconcile-unrealized-casts | \
+    buddy-translate --mlir-to-llvmir | \
+    buddy-llc \
+        --filetype=obj \
+        -o dip.o
+
+    $CXX ./dip.o ./buddy-codegen.cc -o buddy-codegen
+    runHook postBuild
+  '';
+
+  installPhase = ''
+    runHook preInstall
+    mkdir -p $out/bin
+    cp -v buddy-codegen $out/bin/
+    runHook postInstall
+  '';
+}
diff --git a/nix/pkgs/buddy-codegen/dip.mlir b/nix/pkgs/buddy-codegen/dip.mlir
new file mode 100644
index 000000000..44a0c194d
--- /dev/null
+++ b/nix/pkgs/buddy-codegen/dip.mlir
@@ -0,0 +1,30 @@
+//===- DIP.mlir -----------------------------------------------------------===//
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides DIP dialect functions.
+//
+//===----------------------------------------------------------------------===//
+func.func @resize_4d_nchw_nearest_neighbour_interpolation(%inputImage : memref<?x?x?x?xf32>, %horizontal_scaling_factor : f32, %vertical_scaling_factor : f32, %outputImage : memref<?x?x?x?xf32>) attributes{llvm.emit_c_interface}
+{
+  dip.resize_4d_nchw NEAREST_NEIGHBOUR_INTERPOLATION %inputImage, %horizontal_scaling_factor, %vertical_scaling_factor, %outputImage : memref<?x?x?x?xf32>, f32, f32, memref<?x?x?x?xf32>
+  return
+}
+
+func.func @resize_4d_nchw_bilinear_interpolation(%inputImage : memref<?x?x?x?xf32>, %horizontal_scaling_factor : f32, %vertical_scaling_factor : f32, %outputImage : memref<?x?x?x?xf32>) attributes{llvm.emit_c_interface}
+{
+  dip.resize_4d_nchw BILINEAR_INTERPOLATION %inputImage, %horizontal_scaling_factor, %vertical_scaling_factor, %outputImage : memref<?x?x?x?xf32>, f32, f32, memref<?x?x?x?xf32>
+  return
+}
diff --git a/nix/pkgs/buddy-llvm.nix b/nix/pkgs/buddy-llvm.nix
index 690a7fe62..b04ce38bb 100644
--- a/nix/pkgs/buddy-llvm.nix
+++ b/nix/pkgs/buddy-llvm.nix
@@ -12,20 +12,25 @@ let
     ps.pybind11
     ps.pyyaml
     ps.ml-dtypes
+    ps.nanobind
   ]);
 in
 stdenv.mkDerivation rec {
   name = "llvm-for-buddy-mlir";
-  version = "6c59f0e1b0fb56c909ad7c9aad4bde37dc006ae0";
+  version = "3bd3e06f3fe418e24af65457877f40cee0544f9d";
   src = fetchFromGitHub {
     owner = "llvm";
     repo = "llvm-project";
     rev = version;
-    hash = "sha256-bMJJ2q1hSh7m0ewclHOmIe7lOHv110rz/P7D3pw8Uiw=";
+    hash = "sha256-JSquIeA14dXKXO6E8v0HV36fA/+bZypJKkcGMvPxxHI=";
   };
 
   requiredSystemFeatures = [ "big-parallel" ];
 
+  patches = [
+    ../patches/llvm/fix-vector-convert.patch
+  ];
+
   propagatedBuildInputs = [
     pythonEnv
   ];
@@ -66,6 +71,7 @@ stdenv.mkDerivation rec {
 
     # move all lib files to $lib except lib/cmake
     moveToOutput "lib" "$lib"
+    moveToOutput "python_packages" "$lib"
     moveToOutput "lib/cmake" "$dev"
     moveToOutput "src" "$dev"
 
diff --git a/nix/pkgs/buddy-mlir.nix b/nix/pkgs/buddy-mlir.nix
index 6cb60c8d7..2491b0beb 100644
--- a/nix/pkgs/buddy-mlir.nix
+++ b/nix/pkgs/buddy-mlir.nix
@@ -1,4 +1,5 @@
 {
+  lib,
   cmake,
   ninja,
   llvmPackages_17,
@@ -17,12 +18,22 @@ let
     version = "unstable-2024-07-18";
 
     src = fetchFromGitHub {
-      owner = "buddy-compiler";
+      owner = "WuXintong123";
       repo = "buddy-mlir";
-      rev = "c57584a0e3c38e938a3902320f62b202ced84996";
-      hash = "sha256-IBsShnkaA0qPkEMbkkSjUMWXnDGW/CrTeiSSLLttlXk=";
+      rev = "6586555adf921371906fe908293714bff4d92b24";
+      hash = "sha256-NDdj72oNhIKcU7cOw+RDzPrjKLIUVY63TDUrJ2DzYL0=";
     };
 
+    patches = [
+      ../patches/buddy-mlir/00-fix-splat-op.patch
+    ];
+
+    postPatch = ''
+      sed -i \
+        's|link_directories(''${LLVM_BINARY_DIR}/tools/mlir/|link_directories(''${LLVM_BINARY_DIR}/|' \
+        midend/python/CMakeLists.txt
+    '';
+
     nativeBuildInputs = [
       cmake
       ninja
@@ -33,8 +44,8 @@ let
     ];
 
     cmakeFlags = [
-      "-DMLIR_DIR=${buddy-llvm.dev}/lib/cmake/mlir"
-      "-DLLVM_DIR=${buddy-llvm.dev}/lib/cmake/llvm"
+      "-DMLIR_DIR=${buddy-llvm}/lib/cmake/mlir"
+      "-DLLVM_DIR=${buddy-llvm}/lib/cmake/llvm"
       "-DLLVM_MAIN_SRC_DIR=${buddy-llvm.src}/llvm"
       "-DBUDDY_MLIR_ENABLE_PYTHON_PACKAGES=ON"
       "-DCMAKE_BUILD_TYPE=Release"
@@ -43,11 +54,17 @@ let
     # No need to do check, and it also takes too much time to finish.
     doCheck = false;
 
+    # TODO: Upstream this to Buddy-MLIR cmake install
+    postInstall = ''
+      mkdir -p "$out/include"
+      cp -vr "$NIX_BUILD_TOP/$sourceRoot/frontend/Interfaces/buddy" "$out/include"
+    '';
+
     # Here we concatenate the LLVM and Buddy python module into one directory for easier import
     postFixup = ''
       mkdir -p $out/lib/python${python3.pythonVersion}/site-packages
       cp -vr $out/python_packages/buddy $out/lib/python${python3.pythonVersion}/site-packages/
-      cp -vr ${buddy-llvm}/python_packages/mlir_core/mlir $out/lib/python${python3.pythonVersion}/site-packages/
+      cp -vr ${buddy-llvm.lib}/python_packages/mlir_core/mlir $out/lib/python${python3.pythonVersion}/site-packages/
     '';
 
     passthru = {
@@ -69,6 +86,7 @@ let
         # tinyllama
         ps.transformers
         ps.accelerate
+        ps.sentencepiece
       ]);
     };
   };
diff --git a/rocketv/src/RocketCore.scala b/rocketv/src/RocketCore.scala
index 25eaf8d26..d144ed139 100644
--- a/rocketv/src/RocketCore.scala
+++ b/rocketv/src/RocketCore.scala
@@ -1609,8 +1609,8 @@ class Rocket(val parameter: RocketParameter)
       t1XRDRetireQueue.deq.ready := (!(wbWxd || (dmemResponseReplay && dmemResponseXpu)) || !vectorTryToWriteRd) && (!(dmemResponseReplay && dmemResponseFpu) || !vectorTryToWriteFP)
 
       when(t1XRDRetireQueue.deq.fire && vectorTryToWriteRd) {
-        longlatencyWdata    := t1.retire.rd.bits.rdData
-        longlatencyWaddress := t1.retire.rd.bits.rdAddress
+        longlatencyWdata    := t1XRDRetireQueue.deq.bits.rdData
+        longlatencyWaddress := t1XRDRetireQueue.deq.bits.rdAddress
         longLatencyWenable  := true.B
       }
       io.fpu.foreach { fpu =>
diff --git a/t1/src/T1.scala b/t1/src/T1.scala
index 88fc63c5f..82ebe2c80 100644
--- a/t1/src/T1.scala
+++ b/t1/src/T1.scala
@@ -684,10 +684,8 @@ class T1(val parameter: T1Parameter)
   val completeIndexInstruction: Bool =
     ohCheck(lsu.lastReport, slots.last.record.instructionIndex, parameter.chainingSize) && !slots.last.state.idle
 
-  val freeOR: Bool = VecInit(slots.map(_.state.idle)).asUInt.orR
-
   /** slot is ready to accept new instructions. */
-  val slotReady: Bool = Mux(specialInstruction, slots.map(_.state.idle).last, freeOR)
+  val slotReady: Bool = VecInit(slots.map(_.state.idle)).asUInt.andR
 
   val olderCheck: Bool = slots.map { re =>
     // The same lsb will make it difficult to distinguish between the new and the old
diff --git a/tests/builder.nix b/tests/builder.nix
index 3b1a9b9ae..aeffd6929 100644
--- a/tests/builder.nix
+++ b/tests/builder.nix
@@ -44,6 +44,9 @@ let
           "-static"
           "-mcmodel=medany"
           "-fvisibility=hidden"
+          "-fno-exceptions"
+          "-fno-rtti"
+          "-fno-threadsafe-statics"
           "-fno-PIC"
           "-g"
           "-O3"
diff --git a/tests/intrinsic/conv2d_resnet/conv2d_resnet.c b/tests/intrinsic/conv2d_resnet/conv2d_resnet.c
new file mode 100644
index 000000000..2f7ed6200
--- /dev/null
+++ b/tests/intrinsic/conv2d_resnet/conv2d_resnet.c
@@ -0,0 +1,80 @@
+#include <riscv_vector.h>
+#include <stddef.h>
+#include <stdint.h>
+
+typedef int32_t vl_type;
+
+// Adapt from conv2d_less_m2, add outChannel and inChannel
+//
+// when AVL >= MAXVL, this is efficent
+void conv2d(int32_t *restrict output_, int32_t const *restrict img_,
+            int32_t const *restrict kernel_, size_t imgRow, size_t imgCol,
+            size_t outChannel, size_t inChannel, size_t kernelSize) {
+
+  size_t const outRow = imgRow - kernelSize + 1;
+  size_t const outCol = imgCol - kernelSize + 1;
+
+  for (size_t coI = 0; coI < outChannel; coI++) {
+    for (size_t ciI = 0; ciI < inChannel; ciI++) {
+      int32_t *output = output_ + coI * (outRow * outCol);
+      int32_t const *img = img_ + ciI * (imgRow * imgCol);
+      int32_t const *kernel = kernel_ + (coI * inChannel + ciI) * kernelSize * kernelSize;
+
+      for (size_t iI = 0; iI < imgRow; iI++) {
+        for (size_t kI = 0; kI < kernelSize; kI++) {
+          // only need img[kI] to img[imgRow + kI - kernelSize]
+          if (!(kI <= iI && iI < kI + outCol))
+            continue;
+  
+          for (size_t kJ = 0; kJ < kernelSize; kJ++) {
+            int32_t const K = kernel[kI * kernelSize + kJ];
+            // from img[iI][kJ] to img[iI][imgCol - kernelSize], step by 1
+            // imgCol - kernelSize + 1 is the number of elements to be processed
+            int32_t const *imgPtr = img + iI * imgCol + kJ;
+            // from output[iI - kI][0] to output[iI - kI][imgCol - kernelSize],
+            int32_t *outPtr = output + (iI - kI) * outCol;
+            // when AVL >= MAXVL, this is efficent
+            size_t avl = imgCol - kernelSize + 1;
+            while (avl > 0) {
+              // TODO: exchange vl-loop and kJ loop, can be more cache friendly
+  
+              size_t vl = __riscv_vsetvl_e32m2(avl);
+  
+              vint32m2_t imgVec = __riscv_vle32_v_i32m2(imgPtr, vl);
+              vint32m2_t mulVec = __riscv_vmul_vx_i32m2(imgVec, K, vl);
+  
+              vint32m2_t outVec = __riscv_vle32_v_i32m2(outPtr, vl);
+              vint32m2_t resVec = __riscv_vadd_vv_i32m2(outVec, mulVec, vl);
+  
+              __riscv_vse32_v_i32m2(outPtr, resVec, vl);
+  
+              avl -= vl;
+              imgPtr += vl;
+              outPtr += vl;
+            }
+          }
+        }
+      }
+    }
+  }
+}
+
+#define VDATA __attribute((section(".vdata")))
+#define VBASS __attribute((section(".vbss")))
+
+// size of image is padded to (H+K-1, W+K-1)
+
+// CONV1: O[224, 224, 64], W[64, 3, 7, 7]
+VDATA int32_t conv1_img[64*230*230];
+VDATA int32_t conv1_output[64*224*224];
+VDATA int32_t conv1_kernel[64*3*7*7];
+
+// CONV2: ...
+
+int test() {
+  conv2d(conv1_output, conv1_img, conv1_kernel, 224, 224, 64, 3, 7);
+
+  // conv2d(...)
+
+  return 0;
+}
diff --git a/tests/pytorch/default.nix b/tests/pytorch/default.nix
index 44a4c09ea..a8c9867cb 100644
--- a/tests/pytorch/default.nix
+++ b/tests/pytorch/default.nix
@@ -25,6 +25,7 @@ let
             nativeBuildInputs = [
               buddy-mlir.pyenv
               buddy-mlir
+              buddy-mlir.llvm
             ];
 
             src = sourcePath;
@@ -41,7 +42,7 @@ let
 
               for mlir in ''${optArtifacts[@]}; do
                 echo "Translating $mlir"
-                buddy-translate --buddy-to-llvmir "$mlir" -o "$mlir.ll"
+                mlir-translate --mlir-to-llvmir "$mlir" -o "$mlir.ll"
 
                 translateArtifacts+=("$mlir.ll")
               done
@@ -55,10 +56,10 @@ let
 
               for llvmir in ''${translateArtifacts[@]}; do
                 echo "Compiling $llvmir"
-                buddy-llc "$llvmir" \
+                llc "$llvmir" \
                   -mtriple=riscv32 \
                   -target-abi=ilp32f \
-                  -mattr=+m,+f,+zve32f \
+                  -mattr=+m,+f,+zvl4096b,+zve32f \
                   --filetype=obj \
                   -o "$llvmir.o"
 
diff --git a/tests/pytorch/include/buddy/Core/Container.h b/tests/pytorch/include/buddy/Core/Container.h
new file mode 100644
index 000000000..9f6229c19
--- /dev/null
+++ b/tests/pytorch/include/buddy/Core/Container.h
@@ -0,0 +1,144 @@
+//===- Container.h --------------------------------------------------------===//
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+//
+// Container descriptor.
+//
+//===----------------------------------------------------------------------===//
+//
+//===----------------------------------------------------------------------===//
+//
+// This is vendored version of buddy-mlir Container.h file at revision
+// c57584a0e3c38e938a3902320f62b202ced84996. Modified for T1 embedded test env.
+//
+//===----------------------------------------------------------------------===//
+
+#include <cstdint>
+#include <vector>
+
+// MemRef descriptor.
+// - T represents the type of the elements.
+// - N represents the number of dimensions.
+// - The storage order is NCHW.
+template <typename T, size_t N> class MemRef {
+public:
+  // Construct using init to allocated area
+  constexpr MemRef(T *allocated, T init, intptr_t sizes[N]);
+  constexpr MemRef(T *allocated, intptr_t sizes[N], intptr_t offset = 0);
+  // Get the data pointer.
+  T *getData();
+  // Get the sizes (shape).
+  const intptr_t *getSizes() { return sizes; }
+  // Get the strides.
+  const intptr_t *getStrides() { return strides; }
+  // Get the rank of the memref.
+  size_t getRank() const { return N; }
+  // Get the size (number of elements).
+  size_t getSize() const { return product(this->sizes); }
+  // Get the element at index.
+  const T &operator[](size_t index) const;
+  T &operator[](size_t index);
+
+protected:
+  // Set the strides.
+  // Computes the strides of the transposed tensor for transpose=true.
+  inline void setStrides();
+  // Compute the product of array elements.
+  inline size_t product(const intptr_t sizes[N]) const;
+
+  // Data.
+  // The `aligned` and `allocated` members point to the same address, `aligned`
+  // member is responsible for handling data, and `allocated` member is
+  // resposible for handling the memory space.
+  T *allocated = nullptr;
+  T *aligned = nullptr;
+  // Offset.
+  intptr_t offset = 0;
+  // Shape.
+  intptr_t sizes[N];
+  // Strides.
+  intptr_t strides[N];
+};
+
+template <typename T, std::size_t N>
+constexpr MemRef<T, N>::MemRef(T *allocated, T init, intptr_t sizes[N])
+    : MemRef(allocated, sizes) {
+  size_t size = product(sizes);
+  std::fill(aligned, aligned + size, init);
+}
+
+// MemRef Array Constructor.
+// Construct a MemRef object from the data pointer, sizes, and offset.
+// The default offset is 0.
+template <typename T, std::size_t N>
+constexpr MemRef<T, N>::MemRef(T *data, intptr_t sizes[N], intptr_t offset) {
+  this->offset = offset;
+  for (size_t i = 0; i < N; i++) {
+    this->sizes[i] = sizes[i];
+  }
+  setStrides();
+  size_t size = product(sizes);
+  allocated = data;
+  aligned = allocated;
+  for (size_t i = 0; i < size; i++) {
+    aligned[i] = data[i];
+  }
+}
+
+// Get the data pointer.
+// Return the `aligned` pointer if the container data size is greater than zero.
+// If the data size is negative or zero, which means no space is allocated for
+// the container data pointer, the function does not allow to return the data
+// pointer.
+template <typename T, std::size_t N> T *MemRef<T, N>::getData() {
+  size_t size = product(this->sizes);
+  return aligned;
+}
+
+// Get the element at index.
+// Return the specific element if the container data size is greater than zero.
+// If the data size is negative or zero, which means no space is allocated for
+// the container data pointer, this operator does not allow to return the data
+// element.
+template <typename T, std::size_t N>
+const T &MemRef<T, N>::operator[](size_t index) const {
+  size_t size = product(this->sizes);
+  return aligned[index + offset];
+}
+
+template <typename T, std::size_t N> T &MemRef<T, N>::operator[](size_t index) {
+  size_t size = product(this->sizes);
+  return aligned[index + offset];
+}
+
+// Calculate the stride values for each dimension based on the sizes.
+template <typename T, std::size_t N> inline void MemRef<T, N>::setStrides() {
+  strides[N - 1] = 1;
+  if (N < 2)
+    return;
+  // Prevent implicit conversions between unsigned and signed
+  for (std::size_t i = N - 1; i > 0; i--) {
+    strides[i - 1] = strides[i] * sizes[i];
+  }
+}
+
+// Calculate the total number of elements in the MemRef container.
+template <typename T, std::size_t N>
+inline size_t MemRef<T, N>::product(const intptr_t sizes[N]) const {
+  size_t size = 1;
+  for (size_t i = 0; i < N; i++)
+    size *= sizes[i];
+  return size;
+}
diff --git a/tests/pytorch/include/img.hpp b/tests/pytorch/include/img.hpp
deleted file mode 100644
index f839df342..000000000
--- a/tests/pytorch/include/img.hpp
+++ /dev/null
@@ -1,234 +0,0 @@
-//===- ImgContainer.h -----------------------------------------------------===//
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===----------------------------------------------------------------------===//
-//
-// original source from https://github.com/buddy-compiler/buddy-mlir
-// Modified for embedded environment.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef IMG_H
-#define IMG_H
-
-#include "memref.hpp"
-
-enum ImageModes {
-  DIP_GRAYSCALE = 0,
-  DIP_RGB = 1,
-};
-
-template <typename T, size_t N> class Image : public MemRef<T, N> {
-public:
-  // Constructor initializes the image by loading from a file.
-  // Params:
-  //   file: Raw data to image file memory
-  //   filesize: size of the file memory
-  //   buffer: pre-allocated space for image modification
-  //   mode: Specifies the image mode (e.g., DIP_GRAYSCALE, DIP_RGB).
-  //   norm: Indicates whether to normalize pixel values (default is false).
-  constexpr Image(const char *file, int32_t filesize, T *buffer,
-                  ImageModes mode, bool norm = false);
-
-  // Overload
-  constexpr Image(T *data, const int32_t sizes[N]);
-  constexpr Image(T *data, T init, const int32_t sizes[N]);
-
-  // Retrieves the name of the current image format as a string.
-  inline const char *getFormatName() const {
-    switch (this->imageFormat) {
-    case ImageFormat::BMP:
-      return "BMP";
-    default:
-      return "Unsupported format";
-    }
-  }
-  // Returns the width of the image in pixels.
-  inline size_t getWidth() const { return this->width; }
-  // Returns the height of the image in pixels.
-  inline size_t getHeight() const { return this->height; }
-  // Returns the bit depth of the image.
-  inline int getBitDepth() const { return this->bitDepth; }
-
-private:
-  // Enum to represent supported image formats.
-  enum class ImageFormat {
-    BMPDecodeError, // Represents an error or unsupported format.
-    BMP,            // BMP file format.
-    Unsupported,
-  } imageFormat;
-  // Mode of the image (e.g., DIP_GRAYSCALE, DIP_RGB).
-  ImageModes imageMode;
-  // Width of the image in pixels.
-  int32_t width;
-  // Height of the image in pixels.
-  int32_t height;
-  // Bit depth of the image.
-  int32_t bitDepth;
-  // Normalization flag.
-  bool isNorm;
-  // Determines the image format from raw file data.
-  inline void determineFormat(const uint8_t *fileData, uint32_t filesize);
-  // Decodes a BMP image from raw file data.
-  inline bool decodeBMP(const uint8_t *fileData, uint32_t filesize);
-};
-
-template <typename T, std::size_t N>
-constexpr Image<T, N>::Image(T *data, const int32_t sizes[N])
-    : MemRef<T, N>(data, sizes){};
-
-template <typename T, std::size_t N>
-constexpr Image<T, N>::Image(T *data, T init, const int32_t sizes[N])
-    : MemRef<T, N>(data, init, sizes){};
-
-// Image Container Constructor
-// Constructs an image container object from the image file path.
-template <typename T, std::size_t N>
-constexpr Image<T, N>::Image(const char *file, int32_t filesize, T *buffer,
-                             ImageModes mode, bool norm)
-    : imageMode(mode), isNorm(norm) {
-
-  this->allocated = buffer;
-  this->aligned = this->allocated;
-
-  determineFormat(file, filesize);
-  if (this->imageFormat == ImageFormat::BMP) {
-    bool success = decodeBMP(file, filesize);
-    if (!success) {
-      this->imageFormat = ImageFormat::BMPDecodeError;
-    };
-  } else {
-    this->imageFormat = ImageFormat::Unsupported;
-  }
-}
-
-// Determines the image format by inspecting the header of the file data.
-template <typename T, std::size_t N>
-inline void Image<T, N>::determineFormat(const uint8_t *fileData,
-                                         uint32_t filesize) {
-  if (filesize > 2 && fileData[0] == 'B' && fileData[1] == 'M') {
-    this->imageFormat = ImageFormat::BMP;
-  } else {
-    this->imageFormat = ImageFormat::BMPDecodeError;
-  }
-}
-
-// BMP Image File Decoder
-template <typename T, std::size_t N>
-inline bool Image<T, N>::decodeBMP(const uint8_t *fileData, uint32_t filesize) {
-  // Check if the provided data is large enough to contain a minimal BMP header
-  // (54 bytes).
-  if (filesize < 54) {
-    return false;
-  }
-
-  // Extract image information from BMP header
-  this->width = *reinterpret_cast<const int32_t *>(&fileData[18]);
-  this->height = *reinterpret_cast<const int32_t *>(&fileData[22]);
-  this->bitDepth = *reinterpret_cast<const uint16_t *>(&fileData[28]);
-  uint32_t compression = *reinterpret_cast<const uint32_t *>(&fileData[30]);
-  size_t pixelDataOffset = *reinterpret_cast<const uint32_t *>(&fileData[10]);
-
-  // Currently, only the BI_RGB (value 0) compression method is supported.
-  if (compression != 0) {
-    return false;
-  }
-
-  // Currently, only the NCHW format with 4 dimensions is supported.
-  if (N == 4) {
-    if (this->imageMode == ImageModes::DIP_GRAYSCALE) {
-      // TODO: Add batch setting.
-      this->sizes[0] = 1;
-      this->sizes[1] = 1;
-      this->sizes[2] = this->height;
-      this->sizes[3] = this->width;
-      this->setStrides();
-      size_t size = this->product(this->sizes);
-      // Fullfill data to memref container.
-      size_t memrefIndex = 0;
-      if (this->bitDepth == 32) {
-        // BMP file is upside-down storage.
-        for (size_t i = this->height; i > 0; i--) {
-          for (size_t j = 0; j < this->width; j++) {
-            // Locate the current pixel.
-            size_t pixelIndex =
-                pixelDataOffset + (((i - 1) * this->width) + j) * 4;
-            // Extract the blue, green, and red value from the current pixel.
-            int bluePixel =
-                *reinterpret_cast<const uint8_t *>(&fileData[pixelIndex]);
-            int greenPixel =
-                *reinterpret_cast<const uint8_t *>(&fileData[pixelIndex + 1]);
-            int redPixel =
-                *reinterpret_cast<const uint8_t *>(&fileData[pixelIndex + 2]);
-            // Calculate the gray scale value.
-            int grayScaleValue = static_cast<int>(
-                0.299 * redPixel + 0.587 * greenPixel + 0.114 * bluePixel);
-            // Store the gray scale value into memref container.
-            this->aligned[memrefIndex] =
-                this->isNorm ? static_cast<T>(grayScaleValue) / 255
-                             : static_cast<T>(grayScaleValue);
-            memrefIndex++;
-          }
-        }
-      } else {
-        return false;
-      }
-    } else if (this->imageMode == ImageModes::DIP_RGB) {
-      // TODO: Add batch setting.
-      this->sizes[0] = 1;
-      this->sizes[1] = 3;
-      this->sizes[2] = this->height;
-      this->sizes[3] = this->width;
-      this->setStrides();
-      size_t size = this->product(this->sizes);
-      // Fullfill data to memref container.
-      size_t memrefIndex = 0;
-      size_t colorStride = this->height * this->width;
-      if (this->bitDepth == 32) {
-        // BMP file is upside-down storage.
-        for (size_t i = height; i > 0; i--) {
-          for (size_t j = 0; j < width; j++) {
-            // Locate the current pixel.
-            size_t pixelIndex = pixelDataOffset + (((i - 1) * width) + j) * 4;
-            // Extract the blue, green, and red value from the current pixel.
-            int bluePixel =
-                *reinterpret_cast<const uint8_t *>(&fileData[pixelIndex]);
-            int greenPixel =
-                *reinterpret_cast<const uint8_t *>(&fileData[pixelIndex + 1]);
-            int redPixel =
-                *reinterpret_cast<const uint8_t *>(&fileData[pixelIndex + 2]);
-            // Store the values into memref container as RGB order. (BGR -> RGB)
-            this->aligned[memrefIndex] = this->isNorm
-                                             ? static_cast<T>(redPixel) / 255
-                                             : static_cast<T>(redPixel);
-            this->aligned[memrefIndex + colorStride] =
-                this->isNorm ? static_cast<T>(greenPixel) / 255
-                             : static_cast<T>(greenPixel);
-            this->aligned[memrefIndex + 2 * colorStride] =
-                this->isNorm ? static_cast<T>(bluePixel) / 255
-                             : static_cast<T>(bluePixel);
-            memrefIndex++;
-          }
-        }
-      } else {
-        return false;
-      }
-    }
-  } else {
-    return false;
-  }
-  return true;
-}
-
-#endif // IMG_H
diff --git a/tests/pytorch/include/memref.hpp b/tests/pytorch/include/memref.hpp
deleted file mode 100644
index 95e6725f2..000000000
--- a/tests/pytorch/include/memref.hpp
+++ /dev/null
@@ -1,80 +0,0 @@
-//===- ImgContainer.h -----------------------------------------------------===//
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===----------------------------------------------------------------------===//
-//
-// Original source from https://github.com/buddy-compiler/buddy-mlir.
-// Modified for embedded environment.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef MEMREF_H
-#define MEMREF_H
-
-#include <cstddef>
-#include <cstdint>
-
-template <typename T, size_t N> class MemRef {
-public:
-  constexpr MemRef(T *data, const int32_t sizes[N]);
-  constexpr MemRef(T *data, T init, const int32_t sizes[N]);
-
-protected:
-  inline void setStrides();
-
-  // https://github.com/llvm/llvm-project/blob/a50b9633357007ff886f3fd228ca4b8a9b9b9852/mlir/lib/Conversion/LLVMCommon/TypeConverter.cpp#L401
-  T *allocated = nullptr;
-  T *aligned = nullptr;
-  int32_t offset = 0;
-  int32_t sizes[N];
-  int32_t strides[N];
-};
-
-template <typename T, std::size_t N>
-constexpr MemRef<T, N>::MemRef(T *data, const int32_t sizes[N]) {
-  for (size_t i = 0; i < N; i++) {
-    this->sizes[i] = sizes[i];
-  }
-
-  setStrides();
-
-  allocated = data;
-  aligned = data;
-}
-
-template <typename T, std::size_t N>
-constexpr MemRef<T, N>::MemRef(T *data, T init, const int32_t sizes[N])
-    : MemRef(data, sizes) {
-
-  int32_t total_size = 0;
-  for (size_t i = 0; i < N; i++) {
-    total_size += sizes[i];
-  }
-
-  for (int32_t i = 0; i < total_size; i++) {
-    aligned[i] = init;
-  }
-}
-
-template <typename T, std::size_t N> inline void MemRef<T, N>::setStrides() {
-  strides[N - 1] = 1;
-  if (N < 2)
-    return;
-
-  for (std::size_t i = N - 1; i > 0; i--) {
-    strides[i - 1] = strides[i] * sizes[i];
-  }
-}
-
-#endif // MEMREF_H
diff --git a/tests/pytorch/lenet/build.nix b/tests/pytorch/lenet/build.nix
index baabc0454..31357cc4d 100644
--- a/tests/pytorch/lenet/build.nix
+++ b/tests/pytorch/lenet/build.nix
@@ -14,43 +14,42 @@ buildBuddyE2ETest {
 
     echo "Lowering forward.mlir"
     buddy-opt forward.mlir -pass-pipeline \
-        "builtin.module(func.func(tosa-to-linalg-named, tosa-to-linalg, tosa-to-tensor, tosa-to-arith), \
+        "builtin.module(func.func(tosa-to-linalg-named, tosa-to-linalg, tosa-to-tensor, tosa-to-arith{use-32-bit}), \
                                   empty-tensor-to-alloc-tensor, convert-elementwise-to-linalg, arith-bufferize, \
                                   func.func(linalg-bufferize, tensor-bufferize), func-bufferize)" \
       | buddy-opt -pass-pipeline \
         "builtin.module(func.func(buffer-deallocation-simplification, convert-linalg-to-loops), \
                                   eliminate-empty-tensors, func.func(llvm-request-c-wrappers), \
-                                  convert-math-to-llvm, convert-math-to-libm, convert-scf-to-cf, \
-                                  convert-arith-to-llvm, expand-strided-metadata, finalize-memref-to-llvm, \
-                                  convert-func-to-llvm, reconcile-unrealized-casts)" \
+                                  convert-math-to-llvm, convert-scf-to-cf, \
+                                  convert-arith-to-llvm{index-bitwidth=32}, expand-strided-metadata, finalize-memref-to-llvm{index-bitwidth=32}, \
+                                  convert-func-to-llvm{index-bitwidth=32}, reconcile-unrealized-casts)" \
       > forward-lowered.mlir
 
     echo "Lowering subgraphs[0]"
     buddy-opt subgraphs0.mlir -pass-pipeline \
-        "builtin.module(func.func(tosa-to-linalg-named, tosa-to-arith, tosa-to-linalg, tosa-to-tensor))" \
+        "builtin.module(func.func(tosa-to-linalg-named, tosa-to-arith{use-32-bit}, tosa-to-linalg, tosa-to-tensor))" \
       | buddy-opt \
-          --convert-elementwise-to-linalg \
+          --eliminate-empty-tensors \
+          --convert-tensor-to-linalg \
+          --linalg-bufferize \
+          --convert-linalg-to-loops \
+          --lower-affine \
           --func-bufferize-dynamic-offset \
           --arith-bufferize \
-          --func-bufferize \
           --tensor-bufferize \
-          --linalg-bufferize \
+          --buffer-deallocation \
           --finalizing-bufferize \
-          --batchmatmul-optimize \
-          --convert-linalg-to-affine-loops \
-          --lower-affine \
-          --convert-vector-to-scf \
-          --convert-scf-to-cf \
+          --memref-expand \
           --llvm-request-c-wrappers \
+          --convert-vector-to-llvm=force-32bit-vector-indices \
           --lower-vector-exp \
           --lower-rvv=rv32 \
-          --convert-vector-to-llvm \
-          --convert-math-to-llvm \
-          --convert-math-to-libm \
-          --convert-arith-to-llvm \
-          --convert-func-to-llvm \
           --expand-strided-metadata \
-          --finalize-memref-to-llvm \
+          --finalize-memref-to-llvm=index-bitwidth=32 \
+          --convert-index-to-llvm=index-bitwidth=32 \
+          --convert-scf-to-cf \
+          --convert-arith-to-llvm=index-bitwidth=32 \
+          --convert-func-to-llvm=index-bitwidth=32 \
           --reconcile-unrealized-casts \
       > subgraphs0-lowered.mlir
 
@@ -58,5 +57,8 @@ buildBuddyE2ETest {
       "forward-lowered.mlir"
       "subgraphs0-lowered.mlir"
     )
+
+    mkdir -p "$out/resources"
+    cp -v ''${optArtifacts[*]} "$out/resources"
   '';
 }
diff --git a/tests/pytorch/lenet/lenet.cc b/tests/pytorch/lenet/lenet.cc
index 3cb5d1a88..226ffe04c 100644
--- a/tests/pytorch/lenet/lenet.cc
+++ b/tests/pytorch/lenet/lenet.cc
@@ -1,4 +1,4 @@
-#include "memref.hpp"
+#include <buddy/Core/Container.h>
 
 #define INPUT_N 1
 #define INPUT_C 1
@@ -13,14 +13,9 @@ __attribute((section(".vdata"))) float output_0[OUTPUT_N];
 __attribute((section(".vdata"))) float param_0[PARAM_N];
 
 // Define the sizes of the input and output tensors.
-static const int32_t sizesInput[4] = {INPUT_N, INPUT_C, INPUT_H, INPUT_W};
-static const int32_t sizesOutput[2] = {1, OUTPUT_N};
-static const int32_t sizesParams[1] = {PARAM_N};
-
-// Create input and output containers for the image and model output.
-MemRef<float, 4> input(input_0, sizesInput);
-MemRef<float, 2> output(output_0, sizesOutput);
-MemRef<float, 1> params(param_0, 2.0, sizesParams);
+static int32_t sizesInput[4] = {INPUT_N, INPUT_C, INPUT_H, INPUT_W};
+static int32_t sizesOutput[2] = {1, OUTPUT_N};
+static int32_t sizesParams[1] = {PARAM_N};
 
 // Declare the target model C interface.
 extern "C" {
@@ -29,6 +24,11 @@ void _mlir_ciface_forward(MemRef<float, 2> *output, MemRef<float, 1> *arg0,
 }
 
 extern "C" int test() {
+  // Create input and output containers for the image and model output.
+  MemRef<float, 4> input(input_0, sizesInput);
+  MemRef<float, 2> output(output_0, sizesOutput);
+  MemRef<float, 1> params(param_0, 2.0, sizesParams);
+
   _mlir_ciface_forward(&output, &params, &input);
   return 0;
 }
diff --git a/tests/pytorch/lib/MemrefCopy.cc b/tests/pytorch/lib/MemrefCopy.cc
index c7f15ff79..073333331 100644
--- a/tests/pytorch/lib/MemrefCopy.cc
+++ b/tests/pytorch/lib/MemrefCopy.cc
@@ -24,38 +24,38 @@ template <typename T> struct UnrankedMemRefType {
 template <typename T, int N> struct StridedMemRefType {
   T *basePtr;
   T *data;
-  int64_t offset;
-  int64_t sizes[N];
-  int64_t strides[N];
+  int32_t offset;
+  int32_t sizes[N];
+  int32_t strides[N];
 };
 
 /// StridedMemRef descriptor type specialized for rank 1.
 template <typename T> struct StridedMemRefType<T, 1> {
   T *basePtr;
   T *data;
-  int64_t offset;
-  int64_t sizes[1];
-  int64_t strides[1];
+  int32_t offset;
+  int32_t sizes[1];
+  int32_t strides[1];
 
-  T &operator[](int64_t idx) { return *(data + offset + idx * strides[0]); }
+  T &operator[](int32_t idx) { return *(data + offset + idx * strides[0]); }
 };
 
 /// StridedMemRef descriptor type specialized for rank 0.
 template <typename T> struct StridedMemRefType<T, 0> {
   T *basePtr;
   T *data;
-  int64_t offset;
+  int32_t offset;
 };
 
 // A reference to one of the StridedMemRef types.
 template <typename T> class DynamicMemRefType {
 public:
-  int64_t rank;
+  int32_t rank;
   T *basePtr;
   T *data;
-  int64_t offset;
-  const int64_t *sizes;
-  const int64_t *strides;
+  int32_t offset;
+  const int32_t *sizes;
+  const int32_t *strides;
 
   explicit DynamicMemRefType(const StridedMemRefType<T, 0> &memRef)
       : rank(0), basePtr(memRef.basePtr), data(memRef.data),
@@ -75,6 +75,7 @@ template <typename T> class DynamicMemRefType {
   }
 };
 
+// TODO: can we vectorize this
 extern "C" void memrefCopy(int32_t elemSize, UnrankedMemRefType<char> *srcArg,
                            UnrankedMemRefType<char> *dstArg) {
   DynamicMemRefType<char> src(*srcArg);
diff --git a/tests/pytorch/matmul/build.nix b/tests/pytorch/matmul/build.nix
index 9e84d8219..67cd326c1 100644
--- a/tests/pytorch/matmul/build.nix
+++ b/tests/pytorch/matmul/build.nix
@@ -6,29 +6,26 @@ buildBuddyE2ETest {
     echo "Lowering forward.mlir"
 
     python ./matmul.py \
-      | buddy-opt --pass-pipeline "builtin.module(func.func(tosa-to-linalg-named, tosa-to-arith, tosa-to-linalg, tosa-to-tensor))" \
-      | buddy-opt --convert-elementwise-to-linalg \
+      | buddy-opt --pass-pipeline "builtin.module(func.func(tosa-to-linalg-named, tosa-to-arith{use-32-bit}, tosa-to-linalg, tosa-to-tensor))" \
+    | buddy-opt \
+        --convert-elementwise-to-linalg \
+        --one-shot-bufferize="bufferize-function-boundaries" \
         --func-bufferize-dynamic-offset \
-        --arith-bufferize \
-        --func-bufferize \
-        --tensor-bufferize \
-        --linalg-bufferize \
-        --finalizing-bufferize \
-        --batchmatmul-optimize \
         --convert-linalg-to-affine-loops \
+        --batchmatmul-optimize \
         --lower-affine \
         --lower-vector-exp \
         --lower-rvv=rv32 \
         --convert-vector-to-scf \
         --convert-scf-to-cf \
+        --convert-cf-to-llvm \
         --llvm-request-c-wrappers \
         --convert-vector-to-llvm \
         --convert-math-to-llvm \
-        --convert-math-to-libm \
-        --convert-arith-to-llvm \
-        --convert-func-to-llvm \
+        --convert-arith-to-llvm=index-bitwidth=32 \
+        --convert-func-to-llvm=index-bitwidth=32 \
         --expand-strided-metadata \
-        --finalize-memref-to-llvm \
+        --finalize-memref-to-llvm=index-bitwidth=32 \
         --reconcile-unrealized-casts \
         -o forward-lowered.mlir
 
diff --git a/tests/pytorch/matmul/matmul.cc b/tests/pytorch/matmul/matmul.cc
index b523f0626..2e1a020dd 100644
--- a/tests/pytorch/matmul/matmul.cc
+++ b/tests/pytorch/matmul/matmul.cc
@@ -1,22 +1,22 @@
-#include "memref.hpp"
+#include <buddy/Core/Container.h>
 
 extern "C" void _mlir_ciface_forward(MemRef<float, 1> *output,
                                      MemRef<float, 1> *arg1,
                                      MemRef<float, 1> *arg2);
 
-// One-dimension, with length 512
-static const int32_t sizes[3] = {8, 8, 8};
-
 __attribute((section(".vdata"))) float input_float_1[512];
-MemRef<float, 1> input1(input_float_1, sizes);
-
 __attribute((section(".vdata"))) float input_float_2[512];
-MemRef<float, 1> input2(input_float_2, sizes);
-
 __attribute((section(".vdata"))) float output_float_1[512];
-MemRef<float, 1> output(output_float_1, sizes);
 
 extern "C" int test() {
+  // One-dimension, with length 512
+  static int32_t sizes[3] = {8, 8, 8};
+
+  MemRef<float, 1> input1(input_float_1, sizes);
+  MemRef<float, 1> input2(input_float_2, sizes);
+  MemRef<float, 1> output(output_float_1, sizes);
+
   _mlir_ciface_forward(&output, &input1, &input2);
+
   return 0;
 }
diff --git a/tests/pytorch/mobilenet/build.nix b/tests/pytorch/mobilenet/build.nix
index ac666cc68..2eddc68c6 100644
--- a/tests/pytorch/mobilenet/build.nix
+++ b/tests/pytorch/mobilenet/build.nix
@@ -20,20 +20,20 @@ buildBuddyE2ETest {
 
     echo "Lowering forward.mlir"
     buddy-opt forward.mlir -pass-pipeline \
-        "builtin.module(func.func(tosa-to-linalg-named, tosa-to-linalg, tosa-to-tensor, tosa-to-arith), \
+        "builtin.module(func.func(tosa-to-linalg-named, tosa-to-linalg, tosa-to-tensor, tosa-to-arith{use-32-bit}), \
               empty-tensor-to-alloc-tensor, convert-elementwise-to-linalg, arith-bufferize, \
               func.func(linalg-bufferize, tensor-bufferize), func-bufferize)" \
       | buddy-opt -pass-pipeline \
         "builtin.module(func.func(buffer-deallocation-simplification, convert-linalg-to-loops), \
               eliminate-empty-tensors, func.func(llvm-request-c-wrappers), \
-              convert-math-to-llvm, convert-math-to-libm, convert-scf-to-cf, \
-              convert-arith-to-llvm, expand-strided-metadata, finalize-memref-to-llvm, \
-              convert-func-to-llvm, reconcile-unrealized-casts)" \
+              convert-math-to-llvm, convert-scf-to-cf, \
+              convert-arith-to-llvm{index-bitwidth=32}, expand-strided-metadata, finalize-memref-to-llvm{index-bitwidth=32}, \
+              convert-func-to-llvm{index-bitwidth=32}, reconcile-unrealized-casts)" \
       > forward-lowered.mlir
 
     echo "Lowering subgraphs[0]"
     buddy-opt subgraphs0.mlir -pass-pipeline \
-        "builtin.module(func.func(tosa-to-linalg-named, tosa-to-arith, tosa-to-linalg, tosa-to-tensor))" \
+        "builtin.module(func.func(tosa-to-linalg-named, tosa-to-arith{use-32-bit}, tosa-to-linalg, tosa-to-tensor))" \
       | buddy-opt \
           --convert-elementwise-to-linalg \
           --func-bufferize-dynamic-offset \
@@ -42,7 +42,6 @@ buildBuddyE2ETest {
           --tensor-bufferize \
           --linalg-bufferize \
           --finalizing-bufferize \
-          --batchmatmul-optimize \
           --convert-linalg-to-affine-loops \
           --lower-affine \
           --convert-vector-to-scf \
@@ -52,11 +51,10 @@ buildBuddyE2ETest {
           --lower-rvv=rv32 \
           --convert-vector-to-llvm \
           --convert-math-to-llvm \
-          --convert-math-to-libm \
-          --convert-arith-to-llvm \
-          --convert-func-to-llvm \
+          --convert-arith-to-llvm=index-bitwidth=32 \
+          --convert-func-to-llvm=index-bitwidth=32 \
           --expand-strided-metadata \
-          --finalize-memref-to-llvm \
+          --finalize-memref-to-llvm=index-bitwidth=32 \
           --reconcile-unrealized-casts \
       > subgraphs0-lowered.mlir
 
diff --git a/tests/pytorch/mobilenet/mobilenet.cc b/tests/pytorch/mobilenet/mobilenet.cc
index d64064fbb..8b398d074 100644
--- a/tests/pytorch/mobilenet/mobilenet.cc
+++ b/tests/pytorch/mobilenet/mobilenet.cc
@@ -15,18 +15,17 @@ __attribute((section(".vdata"))) float output_0[OUTPUT_N];
 __attribute((section(".vdata"))) float param_0[PARAM_N0];
 __attribute((section(".vdata"))) int64_t param_1[PARAM_N1];
 
-// Define the sizes of the input and output tensors.
-static const int32_t sizesInput[4] = {INPUT_N, INPUT_C, INPUT_H, INPUT_W};
-static const int32_t sizesOutput[2] = {1, OUTPUT_N};
-static const int32_t sizesParam0[1] = {PARAM_N0};
-static const int32_t sizesParam1[1] = {PARAM_N1};
-
 extern "C" {
 void _mlir_ciface_forward(MemRef<float, 2> *output, MemRef<float, 1> *arg0,
                           MemRef<int64_t, 1> *arg1, Image<float, 4> *input);
 }
 
 extern "C" int test() {
+  // Define the sizes of the input and output tensors.
+  static int32_t sizesInput[4] = {INPUT_N, INPUT_C, INPUT_H, INPUT_W};
+  static int32_t sizesOutput[2] = {1, OUTPUT_N};
+  static int32_t sizesParam0[1] = {PARAM_N0};
+  static int32_t sizesParam1[1] = {PARAM_N1};
 
   // Generate input memref container with random numbers.
   const int inputSize = INPUT_N * INPUT_C * INPUT_H * INPUT_W;
diff --git a/tests/pytorch/resnet18/build.nix b/tests/pytorch/resnet18/build.nix
new file mode 100644
index 000000000..a143aace9
--- /dev/null
+++ b/tests/pytorch/resnet18/build.nix
@@ -0,0 +1,76 @@
+{
+  fetchurl,
+  buildBuddyE2ETest,
+}:
+let
+  checkpointFile = "resnet18-f37072fd.pth";
+  modelCache = fetchurl {
+    url = "https://download.pytorch.org/models/${checkpointFile}";
+    hash = "sha256-83By/UfonF6CdiHFuv+nUAgZ94lrus7BYLGhbFYOB+w=";
+  };
+in
+buildBuddyE2ETest {
+  caseName = "resnet18";
+
+  optPhase = ''
+    mkdir -p pytorchCache/hub/checkpoints/
+    cp -v ${modelCache} pytorchCache/hub/checkpoints/${checkpointFile}
+    export TORCH_HOME=pytorchCache
+
+    python3 ./resnet18.py --output-dir $PWD
+
+    echo "Lowering forward.mlir"
+    buddy-opt forward.mlir -pass-pipeline \
+        "builtin.module(func.func(tosa-to-linalg-named, tosa-to-linalg, tosa-to-tensor, tosa-to-arith{use-32-bit}), \
+            empty-tensor-to-alloc-tensor, convert-elementwise-to-linalg, convert-vector-to-llvm{force-32bit-vector-indices index-bitwidth=32})" \
+      | buddy-opt -pass-pipeline \
+        "builtin.module(func.func(buffer-deallocation-simplification, convert-linalg-to-loops), \
+            eliminate-empty-tensors, func.func(llvm-request-c-wrappers), \
+            convert-math-to-llvm, convert-scf-to-cf, \
+            convert-arith-to-llvm{index-bitwidth=32}, convert-func-to-llvm{index-bitwidth=32}, \
+            expand-strided-metadata, finalize-memref-to-llvm{index-bitwidth=32}, \
+            convert-func-to-llvm{index-bitwidth=32}, reconcile-unrealized-casts)" \
+      > forward-lowered.mlir
+
+    echo "Lowering subgraph0.mlir"
+    buddy-opt subgraph0.mlir -pass-pipeline \
+        "builtin.module(func.func(tosa-to-linalg-named, tosa-to-arith{use-32-bit}, tosa-to-linalg, tosa-to-tensor))" \
+      | buddy-opt \
+          --convert-elementwise-to-linalg \
+          --one-shot-bufferize="bufferize-function-boundaries" \
+          --func-bufferize-dynamic-offset \
+          --conv-nhwc-fhwc-optimize \
+          --batchmatmul-optimize \
+          --convert-linalg-to-loops \
+          --expand-strided-metadata \
+          --llvm-request-c-wrappers \
+          --lower-affine \
+          --convert-vector-to-llvm="force-32bit-vector-indices index-bitwidth=32" \
+          --convert-scf-to-cf \
+          --convert-cf-to-llvm=index-bitwidth=32 \
+          --convert-arith-to-llvm=index-bitwidth=32 \
+          --convert-math-to-llvm \
+          --convert-func-to-llvm=index-bitwidth=32 \
+          --finalize-memref-to-llvm=index-bitwidth=32 \
+          --convert-index-to-llvm=index-bitwidth=32 \
+          --reconcile-unrealized-casts \
+          --mlir-print-ir-after-change \
+          --verify-each \
+      > subgraph0-lowered.mlir
+
+    echo "Compiling memrefCopy library"
+    $CXX -nostdlib -c ${../lib/MemrefCopy.cc} -o memrefCopy.o
+
+    llcArtifacts+=(
+      memrefCopy.o
+    )
+
+    optArtifacts+=(
+      "forward-lowered.mlir"
+      "subgraph0-lowered.mlir"
+    )
+
+    mkdir -p "$out"/share
+    cp -v ''${optArtifacts[*]} "$out"/share/
+  '';
+}
diff --git a/tests/pytorch/resnet18/dog-224_224.png b/tests/pytorch/resnet18/dog-224_224.png
new file mode 100644
index 000000000..4c6649714
Binary files /dev/null and b/tests/pytorch/resnet18/dog-224_224.png differ
diff --git a/tests/pytorch/resnet18/resnet18.cc b/tests/pytorch/resnet18/resnet18.cc
new file mode 100644
index 000000000..05378626b
--- /dev/null
+++ b/tests/pytorch/resnet18/resnet18.cc
@@ -0,0 +1,24 @@
+#include <buddy/Core/Container.h>
+
+// Declare the resnet C interface.
+extern "C" void _mlir_ciface_forward(MemRef<float, 2> *output,
+                                     MemRef<float, 1> *arg0,
+                                     MemRef<float, 4> *input);
+
+__attribute((section(".vdata"))) float output_float_1[1000];
+__attribute((section(".vdata"))) float IMAGE[150528];
+__attribute((section(".vdata"))) float PARAMS[11699112];
+
+extern "C" int test() {
+  static int32_t sizes[2] = {1, 1000};
+  static int32_t params_sizes[1] = {11699112};
+  static int32_t image_sizes[4] = {1, 3, 224, 224};
+
+  MemRef<float, 2> output(output_float_1, 7.0, sizes);
+
+  MemRef<float, 4> inputResize(IMAGE, 6.0, image_sizes);
+  MemRef<float, 1> paramsContainer(PARAMS, 5.0, params_sizes);
+
+  _mlir_ciface_forward(&output, &paramsContainer, &inputResize);
+  return 0;
+}
diff --git a/tests/pytorch/resnet18/resnet18.py b/tests/pytorch/resnet18/resnet18.py
new file mode 100644
index 000000000..1cbc61bc4
--- /dev/null
+++ b/tests/pytorch/resnet18/resnet18.py
@@ -0,0 +1,98 @@
+# ===- buddy-resnet-import.py --------------------------------------------------
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# ===---------------------------------------------------------------------------
+#
+# This is the ResNet18 model AOT importer.
+#
+# ===---------------------------------------------------------------------------
+
+import os
+import argparse
+from pathlib import Path
+import numpy as np
+import torch
+import torchvision.models as models
+import torch._inductor.lowering
+from torch._inductor.decomposition import decompositions as inductor_decomp
+from torch._decomp import remove_decompositions
+
+from buddy.compiler.frontend import DynamoCompiler
+from buddy.compiler.graph import GraphDriver
+from buddy.compiler.graph.transform import simply_fuse
+from buddy.compiler.ops import tosa
+
+# Parse command-line arguments
+parser = argparse.ArgumentParser(description="ResNet18 model AOT importer")
+parser.add_argument(
+    "--output-dir", type=str, default="./", help="Directory to save output files."
+)
+args = parser.parse_args()
+
+# Ensure output directory exists
+output_dir = os.path.abspath(args.output_dir)
+os.makedirs(output_dir, exist_ok=True)
+
+# Retrieve the ResNet18 model path.
+model_path = os.path.dirname(os.path.abspath(__file__))
+
+model = models.resnet18(weights=models.ResNet18_Weights.DEFAULT)
+model = model.eval()
+
+# Remove the num_batches_tracked attribute.
+for layer in model.modules():
+    if isinstance(layer, torch.nn.BatchNorm2d):
+        if hasattr(layer, "num_batches_tracked"):
+            del layer.num_batches_tracked
+
+DEFAULT_DECOMPOSITIONS = [
+    torch.ops.aten.max_pool2d_with_indices.default,
+]
+
+remove_decompositions(inductor_decomp, DEFAULT_DECOMPOSITIONS)
+
+# Initialize Dynamo Compiler with specific configurations as an importer.
+dynamo_compiler = DynamoCompiler(
+    primary_registry=tosa.ops_registry,
+    aot_autograd_decomposition=inductor_decomp,
+)
+data = torch.randn([1, 3, 224, 224])
+# Import the model into MLIR module and parameters.
+with torch.no_grad():
+    graphs = dynamo_compiler.importer(model, data)
+assert len(graphs) == 1
+graph = graphs[0]
+params = dynamo_compiler.imported_params[graph]
+pattern_list = [simply_fuse]
+graphs[0].fuse_ops(pattern_list)
+driver = GraphDriver(graphs[0])
+driver.subgraphs[0].lower_to_top_level_ir()
+
+# Write the MLIR module and forward graph to the specified output directory
+with open(os.path.join(output_dir, "subgraph0.mlir"), "w") as module_file:
+    print(driver.subgraphs[0]._imported_module, file=module_file)
+with open(os.path.join(output_dir, "forward.mlir"), "w") as module_file:
+    print(driver.construct_main_graph(True), file=module_file)
+
+params = dynamo_compiler.imported_params[graph]
+current_path = os.path.dirname(os.path.abspath(__file__))
+
+float32_param = np.concatenate(
+    [
+        param.detach().numpy().reshape([-1])
+        for param in params
+        if param.dtype == torch.float32
+    ]
+)
+float32_param.tofile(Path(output_dir) / "arg0.data")
diff --git a/tests/pytorch/tinyllama/build.nix b/tests/pytorch/tinyllama/build.nix
index 24e4c00d7..e07a2a895 100644
--- a/tests/pytorch/tinyllama/build.nix
+++ b/tests/pytorch/tinyllama/build.nix
@@ -14,45 +14,21 @@ buildBuddyE2ETest {
 
   env.LLAMA_MODEL_PATH = "${model}";
   optPhase = ''
-    python ./tinyllama.py
+    python3 ./tinyllama.py --output-dir $PWD
 
     echo "Lowering forward.mlir"
-    buddy-opt forward.mlir -pass-pipeline \
-      "builtin.module(func.func(tosa-to-linalg-named),func.func(tosa-to-linalg),\
-      func.func(tosa-to-tensor),func.func(tosa-to-arith))" \
-      | buddy-opt --arith-expand \
-        --eliminate-empty-tensors \
-        --empty-tensor-to-alloc-tensor \
-        --one-shot-bufferize \
-        --matmul-parallel-vectorization-optimize \
-        --batchmatmul-optimize \
-        --convert-linalg-to-affine-loops \
-        --affine-loop-fusion \
-        --lower-affine \
-        --func-bufferize \
-        --arith-bufferize \
-        --tensor-bufferize \
-        --buffer-deallocation \
-        --finalizing-bufferize \
-        --convert-vector-to-scf \
-        --expand-strided-metadata \
-        --convert-vector-to-llvm \
-        --memref-expand \
-        --arith-expand \
-        --convert-arith-to-llvm \
-        --finalize-memref-to-llvm \
-        --convert-scf-to-cf \
-        --llvm-request-c-wrappers \
-        --convert-arith-to-llvm \
-        --convert-math-to-llvm \
-        --convert-math-to-libm  \
-        --convert-func-to-llvm \
-        --reconcile-unrealized-casts \
+      cat forward.mlir \
+      | buddy-opt \
+          --expand-strided-metadata \
+          --finalize-memref-to-llvm=index-bitwidth=32 \
+          --llvm-request-c-wrappers \
+          --convert-func-to-llvm=index-bitwidth=32 \
+          --reconcile-unrealized-casts \
       > forward-lowered.mlir
 
     echo "Lowering subgraphs[0]"
-    buddy-opt subgraphs0.mlir -pass-pipeline \
-        "builtin.module(func.func(tosa-to-linalg-named, tosa-to-arith, tosa-to-linalg, tosa-to-tensor))" \
+    buddy-opt subgraph0.mlir -pass-pipeline \
+        "builtin.module(func.func(tosa-to-linalg-named, tosa-to-arith{use-32-bit}, tosa-to-linalg, tosa-to-tensor))" \
       | buddy-opt \
           --convert-elementwise-to-linalg \
           --arith-expand \
@@ -64,29 +40,26 @@ buildBuddyE2ETest {
           --arith-bufferize \
           --buffer-deallocation \
           --finalizing-bufferize \
-          --matmul-parallel-vectorization-optimize \
-          --batchmatmul-optimize \
           --convert-linalg-to-affine-loops \
           --affine-loop-fusion \
           --lower-affine \
           --convert-vector-to-scf \
           --expand-strided-metadata \
+          --llvm-request-c-wrappers \
           --cse \
           --lower-vector-exp \
           --lower-rvv=rv32 \
           --convert-vector-to-llvm \
           --memref-expand \
           --arith-expand \
-          --convert-arith-to-llvm \
-          --finalize-memref-to-llvm \
+          --convert-arith-to-llvm=index-bitwidth=32 \
+          --finalize-memref-to-llvm=index-bitwidth=32 \
           --convert-scf-to-cf \
-          --llvm-request-c-wrappers \
-          --convert-arith-to-llvm \
+          --convert-arith-to-llvm=index-bitwidth=32 \
           --convert-math-to-llvm \
-          --convert-math-to-libm  \
-          --convert-func-to-llvm \
+          --convert-func-to-llvm=index-bitwidth=32 \
           --reconcile-unrealized-casts \
-      > subgraphs0-lowered.mlir
+      > subgraph0-lowered.mlir
 
     echo "Compiling memrefCopy library"
     $CXX -nostdlib -c ${../lib/MemrefCopy.cc} -o memrefCopy.o
@@ -96,7 +69,10 @@ buildBuddyE2ETest {
 
     optArtifacts+=(
       "forward-lowered.mlir"
-      "subgraphs0-lowered.mlir"
+      "subgraph0-lowered.mlir"
     )
+
+    mkdir -p "$out/resources"
+    cp -v ''${optArtifacts[*]} "$out/resources"
   '';
 }
diff --git a/tests/pytorch/tinyllama/tinyllama.cc b/tests/pytorch/tinyllama/tinyllama.cc
index 705d16985..b53d2ade1 100644
--- a/tests/pytorch/tinyllama/tinyllama.cc
+++ b/tests/pytorch/tinyllama/tinyllama.cc
@@ -18,43 +18,44 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "memref.hpp"
+#include <buddy/Core/Container.h>
+#include <cstdio>
 
-constexpr size_t ParamsSize = 110581;
-// constexpr size_t ParamsSize = 11058;
-constexpr size_t MaxVocabSize = 32000;
-constexpr size_t MaxTokenLength = 40;
-constexpr size_t HiddenSize = 2048;
+#define PARAMS_SIZE 673
+#define MAX_VOCAB_SIZE 320
+#define MAX_TOKEN_LENGTH 40
+#define HIDDEN_SIZE 128
 
 // resultContainer[0]
-__attribute((section(".vdata"))) float result0[1 + MaxTokenLength + HiddenSize];
-static constexpr int32_t sizesResult0[3] = {1, MaxTokenLength, HiddenSize};
-
+__attribute((
+    section(".vdata"))) float result0[1 + MAX_TOKEN_LENGTH + HIDDEN_SIZE];
 // resultContainer[1]
 __attribute((
-    section(".vdata"))) float result1[1 + MaxTokenLength + MaxVocabSize];
-static constexpr int32_t sizesResult1[3] = {1, MaxTokenLength, MaxVocabSize};
-
+    section(".vdata"))) float result1[1 + MAX_TOKEN_LENGTH + MAX_VOCAB_SIZE];
 // inputContainer
-__attribute((section(".vdata"))) int32_t input[1 + MaxTokenLength];
-static constexpr int32_t sizesInput[2] = {1, MaxTokenLength};
-
+__attribute((section(".vdata"))) int32_t input[1 + MAX_TOKEN_LENGTH];
 // paramsContainer
-__attribute((section(".vdata"))) float param[ParamsSize];
-static constexpr int32_t sizesParam[1] = {ParamsSize};
+__attribute((section(".vdata"))) float param[PARAMS_SIZE];
 
 extern "C" {
 void _mlir_ciface_forward(MemRef<float, 3> *a, MemRef<float, 1> *b,
                           MemRef<int32_t, 2> *c);
 }
 
-MemRef<float, 3> resultContainer[2] = {
-    MemRef<float, 3>(result0, 2.0, sizesResult0),
-    MemRef<float, 3>(result1, 3.0, sizesResult1)};
-MemRef<int32_t, 2> inputContainer(input, 4, sizesInput);
-MemRef<float, 1> paramsContainerf32(param, 5.0, sizesParam);
-
 extern "C" int test() {
+  int32_t sizesResult0[3] = {1, MAX_TOKEN_LENGTH, HIDDEN_SIZE};
+  int32_t sizesResult1[3] = {1, MAX_TOKEN_LENGTH, MAX_VOCAB_SIZE};
+
+  int32_t sizesInput[2] = {1, MAX_TOKEN_LENGTH};
+  MemRef<int32_t, 2> inputContainer(input, 4.0, sizesInput);
+
+  MemRef<float, 3> resultContainer[2] = {
+      MemRef<float, 3>(result0, 2.0, sizesResult0),
+      MemRef<float, 3>(result1, 3.0, sizesResult1)};
+
+  int32_t sizesParam[1] = {PARAMS_SIZE};
+  MemRef<float, 1> paramsContainerf32(param, 5.0, sizesParam);
+
   _mlir_ciface_forward(resultContainer, &paramsContainerf32, &inputContainer);
   return 0;
 }
diff --git a/tests/pytorch/tinyllama/tinyllama.py b/tests/pytorch/tinyllama/tinyllama.py
index bd60c6567..b47bbb605 100644
--- a/tests/pytorch/tinyllama/tinyllama.py
+++ b/tests/pytorch/tinyllama/tinyllama.py
@@ -1,39 +1,37 @@
-# ===- buddy_tinyllama_import.py -----------------------------------------------
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-# ===---------------------------------------------------------------------------
-#
-# This is the TinyLlama model AOT importer.
-#
-# ===---------------------------------------------------------------------------
-
 import os
-import sys
+import argparse
 import torch
+import torch._dynamo as dynamo
+from transformers import LlamaForCausalLM, LlamaTokenizer
 from torch._inductor.decomposition import decompositions as inductor_decomp
-from transformers import AutoModelForCausalLM, AutoTokenizer
+import numpy
 
 from buddy.compiler.frontend import DynamoCompiler
 from buddy.compiler.ops import tosa
 from buddy.compiler.graph import GraphDriver
-from buddy.compiler.graph.transform import simply_fuse
+from buddy.compiler.graph.transform import simply_fuse, apply_classic_fusion
+
+# Add argument parser to allow custom output directory.
+parser = argparse.ArgumentParser(description="LLaMA2 model AOT importer")
+parser.add_argument(
+    "--output-dir", type=str, default="./", help="Directory to save output files."
+)
+args = parser.parse_args()
 
-checkpoint = os.environ.get("LLAMA_MODEL_PATH")
-if checkpoint is None:
-    sys.exit("Error: No model path was provided. Please set $LLAMA_MODEL_PATH")
-tokenizer = AutoTokenizer.from_pretrained(checkpoint)
-model = AutoModelForCausalLM.from_pretrained(checkpoint, device_map="auto")
+# Ensure the output directory exists.
+output_dir = args.output_dir
+os.makedirs(output_dir, exist_ok=True)
+
+# Retrieve the LLaMA model path from environment variables.
+model_path = os.environ.get("LLAMA_MODEL_PATH")
+if model_path is None:
+    raise EnvironmentError(
+        "The environment variable 'LLAMA_MODEL_PATH' is not set or is invalid."
+    )
+
+# Initialize the tokenizer and model from the specified model path.
+tokenizer = LlamaTokenizer.from_pretrained(model_path, legacy=True)
+model = LlamaForCausalLM.from_pretrained(model_path, torchscript=True)
 model.config.use_cache = False
 
 # Initialize Dynamo Compiler with specific configurations as an importer.
@@ -54,7 +52,9 @@
 graphs[0].fuse_ops(pattern_list)
 driver = GraphDriver(graphs[0])
 driver.subgraphs[0].lower_to_top_level_ir()
-with open("subgraphs0.mlir", "w") as module_file:
+
+# Save the generated files to the specified output directory.
+with open(os.path.join(output_dir, "subgraph0.mlir"), "w") as module_file:
     print(driver.subgraphs[0]._imported_module, file=module_file)
-with open("forward.mlir", "w") as module_file:
+with open(os.path.join(output_dir, "forward.mlir"), "w") as module_file:
     print(driver.construct_main_graph(True), file=module_file)