From 4c736e3728e4e789a4b74cb15079fa7e28a8ddb4 Mon Sep 17 00:00:00 2001
From: Eddie-Wang1120
Date: Sun, 16 Feb 2025 15:03:25 +0800
Subject: [PATCH 1/4] commit paper code

---
 .gitmodules                        |    4 +-
 CMakeLists.txt                     |    7 +-
 README.md                          |   19 +-
 include/bitnet-lut-kernels.h       |  627 +++++++++++++++++
 include/ggml-bitnet.h              |    9 +
 include/kernel_config.ini          |   21 +
 setup_env.py                       |   43 +-
 src/ggml-bitnet-lut.cpp            |   74 ++
 utils/codegen_tl2_loss.py          | 1056 ++++++++++++++++++++++++++++
 utils/convert-hf-to-gguf-bitnet.py |  122 +++-
 10 files changed, 1956 insertions(+), 26 deletions(-)
 create mode 100644 include/bitnet-lut-kernels.h
 create mode 100644 include/kernel_config.ini
 create mode 100644 utils/codegen_tl2_loss.py

diff --git a/.gitmodules b/.gitmodules
index 2b36e4928..60c975a19 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -1,4 +1,4 @@
 [submodule "3rdparty/llama.cpp"]
 	path = 3rdparty/llama.cpp
-	url = https://github.com/Eddie-Wang1120/llama.cpp.git
-	branch = merge-dev
+	url = git@github.com:Eddie-Wang1120/llama.cpp.git
+	branch = pp
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 6ddaa51f7..bd1143b9a 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -14,6 +14,7 @@ set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)
 # option list
 option(BITNET_ARM_TL1 "bitnet.cpp: use tl1 on arm platform" OFF)
 option(BITNET_X86_TL2 "bitnet.cpp: use tl2 on x86 platform" OFF)
+option(BITNET_TL2_LOSS "bitnet.cpp: use tl2-loss kernels" OFF)
 
 set(CMAKE_CXX_STANDARD_REQUIRED true)
@@ -24,6 +25,7 @@ set(THREADS_PREFER_PTHREAD_FLAG ON)
 # override ggml options
 set(GGML_BITNET_ARM_TL1 ${BITNET_ARM_TL1})
 set(GGML_BITNET_X86_TL2 ${BITNET_X86_TL2})
+set(GGML_BITNET_TL2_LOSS ${BITNET_TL2_LOSS})
 
 if (GGML_BITNET_ARM_TL1)
     add_compile_definitions(GGML_BITNET_ARM_TL1)
@@ -31,9 +33,8 @@ endif()
 if (GGML_BITNET_X86_TL2)
     add_compile_definitions(GGML_BITNET_X86_TL2)
 endif()
-
-if (CMAKE_C_COMPILER_ID STREQUAL "GNU" OR CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
-    add_compile_options(-fpermissive)
+if (GGML_BITNET_TL2_LOSS)
+    add_compile_definitions(GGML_BITNET_TL2_LOSS)
 endif()
 
 find_package(Threads REQUIRED)
diff --git a/README.md b/README.md
index a439f0a3f..0b6bac53b 100644
--- a/README.md
+++ b/README.md
@@ -43,8 +43,9 @@ This project is based on the [llama.cpp](https://github.com/ggerganov/llama.cpp
   I2_S
-  TL1
-  TL2
+  TL1(TL1_1)
+  TL2(TL2_1)
+  TL2-Loss(TL2_0)
 bitnet_b1_58-large
 ✅
 ❌
 ✅
+ ✅
 ARM
 ✅
 ✅
 ❌
+ ✅
 bitnet_b1_58-3B
 ❌
 ❌
 ✅
+ ✅
 ARM
 ❌
 ✅
 ❌
+ ✅
 Llama3-8B-1.58-100B-tokens
 ✅
 ❌
 ✅
+ ✅
 ARM
 ✅
 ✅
 ❌
+ ✅
 Falcon3 Family
 ✅
 ❌
 ✅
+ ✅
 ARM
 ✅
 ✅
 ❌
+ ✅
@@ -144,11 +153,11 @@ pip install -r requirements.txt
 3. Build the project
 ```bash
 # Download the model from Hugging Face, convert it to quantized gguf format, and build the project
-python setup_env.py --hf-repo tiiuae/Falcon3-7B-Instruct-1.58bit -q i2_s
+python setup_env.py --hf-repo 1bitLLM/bitnet_b1_58-large -q i2_s
 # Or you can manually download the model and run with local path
-huggingface-cli download tiiuae/Falcon3-7B-Instruct-1.58bit --local-dir models/Falcon3-7B-Instruct-1.58bit
-python setup_env.py -md models/Falcon3-7B-Instruct-1.58bit -q i2_s
+huggingface-cli download 1bitLLM/bitnet_b1_58-large --local-dir models/bitnet_b1_58-large
+python setup_env.py -md models/bitnet_b1_58-large -q i2_s
 ```
 usage: setup_env.py [-h] [--hf-repo {1bitLLM/bitnet_b1_58-large,1bitLLM/bitnet_b1_58-3B,HF1BitLLM/Llama3-8B-1.58-100B-tokens,tiiuae/Falcon3-1B-Instruct-1.58bit,tiiuae/Falcon3-3B-Instruct-1.58bit,tiiuae/Falcon3-7B-Instruct-1.58bit,tiiuae/Falcon3-10B-Instruct-1.58bit}] [--model-dir MODEL_DIR] [--log-dir LOG_DIR] [--quant-type {i2_s,tl1}] [--quant-embd]
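The same pipeline applies to the `tl2-loss` quant type this patch adds; a minimal sketch (the `-q tl2-loss` flag and its conversion/CMake wiring come from the `setup_env.py` changes below):

```bash
# tl2-loss build sketch: setup_env.py converts with --outtype tl2 --loss and
# configures CMake with -DBITNET_TL2_LOSS=ON (see the setup_env.py diff below)
huggingface-cli download 1bitLLM/bitnet_b1_58-large --local-dir models/bitnet_b1_58-large
python setup_env.py -md models/bitnet_b1_58-large -q tl2-loss
```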
diff --git a/include/bitnet-lut-kernels.h b/include/bitnet-lut-kernels.h
new file mode 100644
index 000000000..daf470e0a
--- /dev/null
+++ b/include/bitnet-lut-kernels.h
@@ -0,0 +1,627 @@
+#if defined(GGML_BITNET_ARM_TL1)
+#include "ggml-bitnet.h"
+#define GGML_BITNET_MAX_NODES 8192
+static bool initialized = false;
+static bitnet_tensor_extra * bitnet_tensor_extras = nullptr;
+static size_t bitnet_tensor_extras_index = 0;
+static void * aligned_malloc(size_t size) {
+#if defined(_WIN32)
+    return _aligned_malloc(size, 64);
+#else
+    void * ptr = nullptr;
+    posix_memalign(&ptr, 64, size);
+    return ptr;
+#endif
+}
+static void aligned_free(void * ptr) {
+#if defined(_WIN32)
+    _aligned_free(ptr);
+#else
+    free(ptr);
+#endif
+}
+
+void per_tensor_quant(int k, void* lut_scales_, void* b_) {
+    bitnet_float_type* lut_scales = (bitnet_float_type*)lut_scales_;
+    bitnet_float_type* b = (bitnet_float_type*)b_;
+#ifdef __ARM_NEON
+    float32x4_t temp_max = vdupq_n_f32(0);
+    for (int i=0; i < k / 4; i++) {
+      float32x4_t vec_bs = vld1q_f32(b + 4 * i);
+      float32x4_t abssum = vabsq_f32(vec_bs);
+      temp_max = vmaxq_f32(abssum, temp_max);
+    }
+    float32_t scales = 127 / vmaxvq_f32(temp_max);
+    *lut_scales = scales;
+#elif defined __AVX2__
+    __m256 max_vec = _mm256_set1_ps(0.f);
+    const __m256 vec_sign = _mm256_set1_ps(-0.0f);
+    // #pragma unroll
+    for (int i = 0; i < k / 8; i++) {
+        __m256 vec_b = _mm256_loadu_ps(b + i * 8);
+        __m256 vec_babs = _mm256_andnot_ps(vec_sign, vec_b);
+        max_vec = _mm256_max_ps(vec_babs, max_vec);
+    }
+    __m128 max1 = _mm_max_ps(_mm256_extractf128_ps(max_vec, 1), _mm256_castps256_ps128(max_vec));
+    max1 = _mm_max_ps(max1, _mm_movehl_ps(max1, max1));
+    max1 = _mm_max_ss(max1, _mm_movehdup_ps(max1));
+    float scales = 127 / _mm_cvtss_f32(max1);
+    *lut_scales = scales;
+#endif
+}
+
+void partial_max_reset(void* lut_scales_) {
+    bitnet_float_type* lut_scales = (bitnet_float_type*)lut_scales_;
+    *lut_scales = 0.0;
+}
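+// Note: the resulting scale is 127 / max_i |b_i| over the whole activation
+// tensor, so round(b_i * scale) in lut_ctor always fits int8 ([-127, 127]).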
+
+#ifdef __ARM_NEON
+inline void Transpose_8_8(
+    int16x8_t *v0,
+    int16x8_t *v1,
+    int16x8_t *v2,
+    int16x8_t *v3,
+    int16x8_t *v4,
+    int16x8_t *v5,
+    int16x8_t *v6,
+    int16x8_t *v7)
+{
+    int16x8x2_t q04 = vzipq_s16(*v0, *v4);
+    int16x8x2_t q15 = vzipq_s16(*v1, *v5);
+    int16x8x2_t q26 = vzipq_s16(*v2, *v6);
+    int16x8x2_t q37 = vzipq_s16(*v3, *v7);
+
+    int16x8x2_t q0246_0 = vzipq_s16(q04.val[0], q26.val[0]);
+    int16x8x2_t q0246_1 = vzipq_s16(q04.val[1], q26.val[1]);
+    int16x8x2_t q1357_0 = vzipq_s16(q15.val[0], q37.val[0]);
+    int16x8x2_t q1357_1 = vzipq_s16(q15.val[1], q37.val[1]);
+
+    int16x8x2_t q_fin_0 = vzipq_s16(q0246_0.val[0], q1357_0.val[0]);
+    int16x8x2_t q_fin_1 = vzipq_s16(q0246_0.val[1], q1357_0.val[1]);
+    int16x8x2_t q_fin_2 = vzipq_s16(q0246_1.val[0], q1357_1.val[0]);
+    int16x8x2_t q_fin_3 = vzipq_s16(q0246_1.val[1], q1357_1.val[1]);
+
+    *v0 = q_fin_0.val[0];
+    *v1 = q_fin_0.val[1];
+    *v2 = q_fin_1.val[0];
+    *v3 = q_fin_1.val[1];
+    *v4 = q_fin_2.val[0];
+    *v5 = q_fin_2.val[1];
+    *v6 = q_fin_3.val[0];
+    *v7 = q_fin_3.val[1];
+}
+#endif
+
+template <int act_k>
+inline void lut_ctor(int8_t* qlut, bitnet_float_type* b, bitnet_float_type* lut_scales) {
+#ifdef __ARM_NEON
+    int16x8_t vec_lut[16];
+    float32_t scales = *lut_scales;
+    uint8_t tbl_mask[16] = {0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15};
+    uint8x16_t tbl_mask_q = vld1q_u8(tbl_mask);
+#pragma unroll
+    for (int k = 0; k < act_k / 16; ++k) {
+        float32x4x2_t vec_bs_x0 = vld2q_f32(b + k * 16);
+        float32x4x2_t vec_bs_x1 = vld2q_f32(b + k * 16 + 8);
+        float32x4_t vec_f_0 = vmulq_n_f32(vec_bs_x0.val[0], scales);
+        float32x4_t vec_f_1 = vmulq_n_f32(vec_bs_x0.val[1], scales);
+        float32x4_t vec_f_2 = vmulq_n_f32(vec_bs_x1.val[0], scales);
+        float32x4_t vec_f_3 = vmulq_n_f32(vec_bs_x1.val[1], scales);
+        int32x4_t vec_b_0 = vcvtnq_s32_f32(vec_f_0);
+        int32x4_t vec_b_1 = vcvtnq_s32_f32(vec_f_1);
+        int32x4_t vec_b_2 = vcvtnq_s32_f32(vec_f_2);
+        int32x4_t vec_b_3 = vcvtnq_s32_f32(vec_f_3);
+        int16x4_t vec_b16_0 = vmovn_s32(vec_b_0);
+        int16x4_t vec_b16_1 = vmovn_s32(vec_b_1);
+        int16x4_t vec_b16_2 = vmovn_s32(vec_b_2);
+        int16x4_t vec_b16_3 = vmovn_s32(vec_b_3);
+        int16x8_t vec_bs_0 = vcombine_s16(vec_b16_0, vec_b16_2);
+        int16x8_t vec_bs_1 = vcombine_s16(vec_b16_1, vec_b16_3);
+        vec_lut[0] = vdupq_n_s16(0);
+        vec_lut[0] = vec_lut[0] - vec_bs_0;
+        vec_lut[0] = vec_lut[0] - vec_bs_1;
+        vec_lut[1] = vdupq_n_s16(0);
+        vec_lut[1] = vec_lut[1] - vec_bs_0;
+        vec_lut[2] = vdupq_n_s16(0);
+        vec_lut[2] = vec_lut[2] - vec_bs_0;
+        vec_lut[2] = vec_lut[2] + vec_bs_1;
+        vec_lut[3] = vdupq_n_s16(0);
+        vec_lut[3] = vec_lut[3] - vec_bs_1;
+        vec_lut[4] = vdupq_n_s16(0);
+        vec_lut[5] = vec_bs_1;
+        vec_lut[6] = vec_bs_0;
+        vec_lut[6] = vec_lut[6] - vec_bs_1;
+        vec_lut[7] = vec_bs_0;
+        vec_lut[8] = vec_bs_0;
+        vec_lut[8] = vec_lut[8] + vec_bs_1;
+        Transpose_8_8(&(vec_lut[0]), &(vec_lut[1]), &(vec_lut[2]), &(vec_lut[3]),
+                      &(vec_lut[4]), &(vec_lut[5]), &(vec_lut[6]), &(vec_lut[7]));
+        Transpose_8_8(&(vec_lut[8]), &(vec_lut[9]), &(vec_lut[10]), &(vec_lut[11]),
+                      &(vec_lut[12]), &(vec_lut[13]), &(vec_lut[14]), &(vec_lut[15]));
+#pragma unroll
+        for (int idx = 0; idx < 8; idx++) {
+            int8x16_t q0_s = vqtbl1q_s8(vreinterpretq_s8_s16(vec_lut[idx]), tbl_mask_q);
+            int8x8_t q0_low = vget_low_s8(q0_s);
+            int8x8_t q0_high = vget_high_s8(q0_s);
+            int8x16_t q1_s = vqtbl1q_s8(vreinterpretq_s8_s16(vec_lut[idx + 8]), tbl_mask_q);
+            int8x8_t q1_low = vget_low_s8(q1_s);
+            int8x8_t q1_high = vget_high_s8(q1_s);
+            vst1_s8(qlut + k * 16 * 8 * 2 + idx * 16 * 2, q0_high);
+            vst1_s8(qlut + k * 16 * 8 * 2 + idx * 16 * 2 + 8, q1_high);
+            vst1_s8(qlut + k * 16 * 8 * 2 + idx * 16 * 2 + 16, q0_low);
+            vst1_s8(qlut + k * 16 * 8 * 2 + idx * 16 * 2 + 24, q1_low);
+        }
+    }
+#endif
+}
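+// Note: entries 0..8 above enumerate the 9 sign combinations of two ternary
+// weights applied to an activation pair (-b0-b1, -b0, -b0+b1, -b1, 0, +b1,
+// b0-b1, b0, b0+b1); the vqtbl1q/transpose shuffle packs them into the
+// 16-entry layout that vqtbl1q_s8 indexes inside the tbl_impl kernels.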
+
+static bool is_type_supported(enum ggml_type type) {
+    if (type == GGML_TYPE_Q4_0 ||
+        type == GGML_TYPE_TL1) {
+        return true;
+    } else {
+        return false;
+    }
+}
+#include <arm_neon.h>
+
+#define BM1536_4096 256
+#define BBK1536_4096 128
+inline void tbl_impl_1536_4096(int32_t* c, int8_t* lut, uint8_t* a) {
+#ifdef __ARM_NEON
+    const int KK = BBK1536_4096 / 2;
+    const uint8x16_t vec_mask = vdupq_n_u8(0x0f);
+    const int16x8_t vec_zero = vdupq_n_s16(0x0000);
+    int8x16_t vec_lut[2 * KK];
+    int16x8_t vec_c[4];
+#pragma unroll
+    for (int k = 0; k < 2 * KK; k++) {
+        vec_lut[k] = vld1q_s8(lut + k * 16);
+    }
+
+#pragma unroll
+    for (int i = 0; i < BM1536_4096; i += 32) {
+        #pragma unroll
+        for (int i=0; i<4; i++) {
+            vec_c[i] = vandq_s16(vec_c[i], vec_zero);
+        }
+
+#pragma unroll
+        for (int k = 0; k < KK / 4; k++) {
+            
+            uint8x16_t vec_a_0 = vld1q_u8(a + i * KK / 2 + k * 32 * 2 + 0 * 16);
+            uint8x16_t vec_a0_top = vshrq_n_u8(vec_a_0, 4);
+            uint8x16_t vec_a0_bot = vandq_u8(vec_a_0, vec_mask);
+            int8x16_t  vec_v_0_left_tmp0 = vqtbl1q_s8(vec_lut[8 * k + 0], vec_a0_top);
+            int8x16_t  vec_v_0_left_tmp1 = vqtbl1q_s8(vec_lut[8 * k + 1], vec_a0_top);
+            int8x16_t  vec_v_0_right_tmp0 = vqtbl1q_s8(vec_lut[8 * k + 2], vec_a0_bot);
+            int8x16_t  vec_v_0_right_tmp1 = vqtbl1q_s8(vec_lut[8 * k + 3], vec_a0_bot);
+            int8x16x2_t  vec_v_left_0 = vzipq_s8(vec_v_0_left_tmp1, vec_v_0_left_tmp0);
+            int8x16x2_t  vec_v_right_0 = vzipq_s8(vec_v_0_right_tmp1, vec_v_0_right_tmp0);
+            vec_c[0] += vec_v_left_0.val[0];
+            vec_c[0] += vec_v_right_0.val[0];
+            vec_c[1] += vec_v_left_0.val[1];
+            vec_c[1] += vec_v_right_0.val[1];
+        
+            uint8x16_t vec_a_1 = vld1q_u8(a + i * KK / 2 + k * 32 * 2 + 1 * 16);
+            uint8x16_t vec_a1_top = vshrq_n_u8(vec_a_1, 4);
+            uint8x16_t vec_a1_bot = vandq_u8(vec_a_1, vec_mask);
+            int8x16_t  vec_v_1_left_tmp0 = vqtbl1q_s8(vec_lut[8 * k + 4], vec_a1_top);
+            int8x16_t  vec_v_1_left_tmp1 = vqtbl1q_s8(vec_lut[8 * k + 5], vec_a1_top);
+            int8x16_t  vec_v_1_right_tmp0 = vqtbl1q_s8(vec_lut[8 * k + 6], vec_a1_bot);
+            int8x16_t  vec_v_1_right_tmp1 = vqtbl1q_s8(vec_lut[8 * k + 7], vec_a1_bot);
+            int8x16x2_t  vec_v_left_1 = vzipq_s8(vec_v_1_left_tmp1, vec_v_1_left_tmp0);
+            int8x16x2_t  vec_v_right_1 = vzipq_s8(vec_v_1_right_tmp1, vec_v_1_right_tmp0);
+            vec_c[0] += vec_v_left_1.val[0];
+            vec_c[0] += vec_v_right_1.val[0];
+            vec_c[1] += vec_v_left_1.val[1];
+            vec_c[1] += vec_v_right_1.val[1];
+        
+            uint8x16_t vec_a_2 = vld1q_u8(a + i * KK / 2 + k * 32 * 2 + 2 * 16);
+            uint8x16_t vec_a2_top = vshrq_n_u8(vec_a_2, 4);
+            uint8x16_t vec_a2_bot = vandq_u8(vec_a_2, vec_mask);
+            int8x16_t  vec_v_2_left_tmp0 = vqtbl1q_s8(vec_lut[8 * k + 0], vec_a2_top);
+            int8x16_t  vec_v_2_left_tmp1 = vqtbl1q_s8(vec_lut[8 * k + 1], vec_a2_top);
+            int8x16_t  vec_v_2_right_tmp0 = vqtbl1q_s8(vec_lut[8 * k + 2], vec_a2_bot);
+            int8x16_t  vec_v_2_right_tmp1 = vqtbl1q_s8(vec_lut[8 * k + 3], vec_a2_bot);
+            int8x16x2_t  vec_v_left_2 = vzipq_s8(vec_v_2_left_tmp1, vec_v_2_left_tmp0);
+            int8x16x2_t  vec_v_right_2 = vzipq_s8(vec_v_2_right_tmp1, vec_v_2_right_tmp0);
+            vec_c[2] += vec_v_left_2.val[0];
+            vec_c[2] += vec_v_right_2.val[0];
+            vec_c[3] += vec_v_left_2.val[1];
+            vec_c[3] += vec_v_right_2.val[1];
+        
+            uint8x16_t vec_a_3 = vld1q_u8(a + i * KK / 2 + k * 32 * 2 + 3 * 16);
+            uint8x16_t vec_a3_top = vshrq_n_u8(vec_a_3, 4);
+            uint8x16_t vec_a3_bot = vandq_u8(vec_a_3, vec_mask);
+            int8x16_t  vec_v_3_left_tmp0 = vqtbl1q_s8(vec_lut[8 * k + 4], vec_a3_top);
+            int8x16_t  vec_v_3_left_tmp1 = vqtbl1q_s8(vec_lut[8 * k + 5], vec_a3_top);
+            int8x16_t  vec_v_3_right_tmp0 = vqtbl1q_s8(vec_lut[8 * k + 6], vec_a3_bot);
+            int8x16_t  vec_v_3_right_tmp1 = vqtbl1q_s8(vec_lut[8 * k + 7], vec_a3_bot);
+            int8x16x2_t  vec_v_left_3 = vzipq_s8(vec_v_3_left_tmp1, vec_v_3_left_tmp0);
+            int8x16x2_t  vec_v_right_3 = vzipq_s8(vec_v_3_right_tmp1, vec_v_3_right_tmp0);
+            vec_c[2] += vec_v_left_3.val[0];
+            vec_c[2] += vec_v_right_3.val[0];
+            vec_c[3] += vec_v_left_3.val[1];
+            vec_c[3] += vec_v_right_3.val[1];
+        
+       }
+
+        int32x4_t vec_v_bot_low_low_0 = vmovl_s16(vget_low_s16(vec_c[0]));
+        int32x4_t vec_v_bot_low_high_0 = vmovl_high_s16(vec_c[0]);
+        vst1q_s32(c + i + 0, vld1q_s32(c + i + 0) + vec_v_bot_low_low_0);
+        vst1q_s32(c + i + 4, vld1q_s32(c + i + 4) + vec_v_bot_low_high_0);
+        int32x4_t vec_v_bot_low_low_1 = vmovl_s16(vget_low_s16(vec_c[1]));
+        int32x4_t vec_v_bot_low_high_1 = vmovl_high_s16(vec_c[1]);
+        vst1q_s32(c + i + 8, vld1q_s32(c + i + 8) + vec_v_bot_low_low_1);
+        vst1q_s32(c + i + 12, vld1q_s32(c + i + 12) + vec_v_bot_low_high_1);
+        int32x4_t vec_v_bot_low_low_2 = vmovl_s16(vget_low_s16(vec_c[2]));
+        int32x4_t vec_v_bot_low_high_2 = vmovl_high_s16(vec_c[2]);
+        vst1q_s32(c + i + 16, vld1q_s32(c + i + 16) + vec_v_bot_low_low_2);
+        vst1q_s32(c + i + 20, vld1q_s32(c + i + 20) + vec_v_bot_low_high_2);
+        int32x4_t vec_v_bot_low_low_3 = vmovl_s16(vget_low_s16(vec_c[3]));
+        int32x4_t vec_v_bot_low_high_3 = vmovl_high_s16(vec_c[3]);
+        vst1q_s32(c + i + 24, vld1q_s32(c + i + 24) + vec_v_bot_low_low_3);
+        vst1q_s32(c + i + 28, vld1q_s32(c + i + 28) + vec_v_bot_low_high_3);
+
+    }
+#endif
+}
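+// Note: each activation byte in `a` packs two 4-bit LUT indices; the kernel
+// splits them with vshrq_n_u8(.., 4) / vandq_u8(.., 0x0f) and resolves both
+// through vqtbl1q_s8, a 16-way table lookup, in place of explicit multiplies.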
+
+int32_t qgemm_lut_1536_4096(void* A, void* LUT, void* Scales, void* LUT_Scales, void* C) {
+    alignas(32) uint32_t CBits[BM1536_4096];
+    memset(&(CBits[0]), 0, BM1536_4096 * sizeof(int32_t));
+#pragma unroll
+    for (int32_t k_outer = 0; k_outer < 4096 / BBK1536_4096; ++k_outer) {
+        tbl_impl_1536_4096((&(((int32_t*)CBits)[0])), (&(((int8_t*)LUT)[(k_outer * BBK1536_4096 / 2 * 32)])), (&(((uint8_t*)A)[(k_outer * BBK1536_4096 / 2 / 2 * BM1536_4096)])));
+    }
+#pragma unroll
+    for (int i = 0; i < BM1536_4096; i++) {
+        ((bitnet_float_type*)C)[i] = (((int32_t*)CBits)[i]) / ((bitnet_float_type*)LUT_Scales)[0] * ((bitnet_float_type*)Scales)[0];
+    }
+  return 0;
+};
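+// Note: the int16 accumulators are widened to int32 in CBits; the epilogue
+// undoes activation quantization (divide by LUT_Scales) and applies the
+// per-tensor weight scale (multiply by Scales) to produce float outputs.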
+#include <arm_neon.h>
+
+#define BM1536_1536 128
+#define BBK1536_1536 64
+inline void tbl_impl_1536_1536(int32_t* c, int8_t* lut, uint8_t* a) {
+#ifdef __ARM_NEON
+    const int KK = BBK1536_1536 / 2;
+    const uint8x16_t vec_mask = vdupq_n_u8(0x0f);
+    const int16x8_t vec_zero = vdupq_n_s16(0x0000);
+    int8x16_t vec_lut[2 * KK];
+    int16x8_t vec_c[8];
+#pragma unroll
+    for (int k = 0; k < 2 * KK; k++) {
+        vec_lut[k] = vld1q_s8(lut + k * 16);
+    }
+
+#pragma unroll
+    for (int i = 0; i < BM1536_1536; i += 64) {
+        #pragma unroll
+        for (int i=0; i<8; i++) {
+            vec_c[i] = vandq_s16(vec_c[i], vec_zero);
+        }
+
+#pragma unroll
+        for (int k = 0; k < KK / 2; k++) {
+            
+            uint8x16_t vec_a_0 = vld1q_u8(a + i * KK / 2 + k * 32 * 2 + 0 * 16);
+            uint8x16_t vec_a0_top = vshrq_n_u8(vec_a_0, 4);
+            uint8x16_t vec_a0_bot = vandq_u8(vec_a_0, vec_mask);
+            int8x16_t  vec_v_0_left_tmp0 = vqtbl1q_s8(vec_lut[4 * k + 0], vec_a0_top);
+            int8x16_t  vec_v_0_left_tmp1 = vqtbl1q_s8(vec_lut[4 * k + 1], vec_a0_top);
+            int8x16_t  vec_v_0_right_tmp0 = vqtbl1q_s8(vec_lut[4 * k + 2], vec_a0_bot);
+            int8x16_t  vec_v_0_right_tmp1 = vqtbl1q_s8(vec_lut[4 * k + 3], vec_a0_bot);
+            int8x16x2_t  vec_v_left_0 = vzipq_s8(vec_v_0_left_tmp1, vec_v_0_left_tmp0);
+            int8x16x2_t  vec_v_right_0 = vzipq_s8(vec_v_0_right_tmp1, vec_v_0_right_tmp0);
+            vec_c[0] += vec_v_left_0.val[0];
+            vec_c[0] += vec_v_right_0.val[0];
+            vec_c[1] += vec_v_left_0.val[1];
+            vec_c[1] += vec_v_right_0.val[1];
+        
+            uint8x16_t vec_a_1 = vld1q_u8(a + i * KK / 2 + k * 32 * 2 + 1 * 16);
+            uint8x16_t vec_a1_top = vshrq_n_u8(vec_a_1, 4);
+            uint8x16_t vec_a1_bot = vandq_u8(vec_a_1, vec_mask);
+            int8x16_t  vec_v_1_left_tmp0 = vqtbl1q_s8(vec_lut[4 * k + 0], vec_a1_top);
+            int8x16_t  vec_v_1_left_tmp1 = vqtbl1q_s8(vec_lut[4 * k + 1], vec_a1_top);
+            int8x16_t  vec_v_1_right_tmp0 = vqtbl1q_s8(vec_lut[4 * k + 2], vec_a1_bot);
+            int8x16_t  vec_v_1_right_tmp1 = vqtbl1q_s8(vec_lut[4 * k + 3], vec_a1_bot);
+            int8x16x2_t  vec_v_left_1 = vzipq_s8(vec_v_1_left_tmp1, vec_v_1_left_tmp0);
+            int8x16x2_t  vec_v_right_1 = vzipq_s8(vec_v_1_right_tmp1, vec_v_1_right_tmp0);
+            vec_c[2] += vec_v_left_1.val[0];
+            vec_c[2] += vec_v_right_1.val[0];
+            vec_c[3] += vec_v_left_1.val[1];
+            vec_c[3] += vec_v_right_1.val[1];
+        
+            uint8x16_t vec_a_2 = vld1q_u8(a + i * KK / 2 + k * 32 * 2 + 2 * 16);
+            uint8x16_t vec_a2_top = vshrq_n_u8(vec_a_2, 4);
+            uint8x16_t vec_a2_bot = vandq_u8(vec_a_2, vec_mask);
+            int8x16_t  vec_v_2_left_tmp0 = vqtbl1q_s8(vec_lut[4 * k + 0], vec_a2_top);
+            int8x16_t  vec_v_2_left_tmp1 = vqtbl1q_s8(vec_lut[4 * k + 1], vec_a2_top);
+            int8x16_t  vec_v_2_right_tmp0 = vqtbl1q_s8(vec_lut[4 * k + 2], vec_a2_bot);
+            int8x16_t  vec_v_2_right_tmp1 = vqtbl1q_s8(vec_lut[4 * k + 3], vec_a2_bot);
+            int8x16x2_t  vec_v_left_2 = vzipq_s8(vec_v_2_left_tmp1, vec_v_2_left_tmp0);
+            int8x16x2_t  vec_v_right_2 = vzipq_s8(vec_v_2_right_tmp1, vec_v_2_right_tmp0);
+            vec_c[4] += vec_v_left_2.val[0];
+            vec_c[4] += vec_v_right_2.val[0];
+            vec_c[5] += vec_v_left_2.val[1];
+            vec_c[5] += vec_v_right_2.val[1];
+        
+            uint8x16_t vec_a_3 = vld1q_u8(a + i * KK / 2 + k * 32 * 2 + 3 * 16);
+            uint8x16_t vec_a3_top = vshrq_n_u8(vec_a_3, 4);
+            uint8x16_t vec_a3_bot = vandq_u8(vec_a_3, vec_mask);
+            int8x16_t  vec_v_3_left_tmp0 = vqtbl1q_s8(vec_lut[4 * k + 0], vec_a3_top);
+            int8x16_t  vec_v_3_left_tmp1 = vqtbl1q_s8(vec_lut[4 * k + 1], vec_a3_top);
+            int8x16_t  vec_v_3_right_tmp0 = vqtbl1q_s8(vec_lut[4 * k + 2], vec_a3_bot);
+            int8x16_t  vec_v_3_right_tmp1 = vqtbl1q_s8(vec_lut[4 * k + 3], vec_a3_bot);
+            int8x16x2_t  vec_v_left_3 = vzipq_s8(vec_v_3_left_tmp1, vec_v_3_left_tmp0);
+            int8x16x2_t  vec_v_right_3 = vzipq_s8(vec_v_3_right_tmp1, vec_v_3_right_tmp0);
+            vec_c[6] += vec_v_left_3.val[0];
+            vec_c[6] += vec_v_right_3.val[0];
+            vec_c[7] += vec_v_left_3.val[1];
+            vec_c[7] += vec_v_right_3.val[1];
+        
+       }
+
+        int32x4_t vec_v_bot_low_low_0 = vmovl_s16(vget_low_s16(vec_c[0]));
+        int32x4_t vec_v_bot_low_high_0 = vmovl_high_s16(vec_c[0]);
+        vst1q_s32(c + i + 0, vld1q_s32(c + i + 0) + vec_v_bot_low_low_0);
+        vst1q_s32(c + i + 4, vld1q_s32(c + i + 4) + vec_v_bot_low_high_0);
+        int32x4_t vec_v_bot_low_low_1 = vmovl_s16(vget_low_s16(vec_c[1]));
+        int32x4_t vec_v_bot_low_high_1 = vmovl_high_s16(vec_c[1]);
+        vst1q_s32(c + i + 8, vld1q_s32(c + i + 8) + vec_v_bot_low_low_1);
+        vst1q_s32(c + i + 12, vld1q_s32(c + i + 12) + vec_v_bot_low_high_1);
+        int32x4_t vec_v_bot_low_low_2 = vmovl_s16(vget_low_s16(vec_c[2]));
+        int32x4_t vec_v_bot_low_high_2 = vmovl_high_s16(vec_c[2]);
+        vst1q_s32(c + i + 16, vld1q_s32(c + i + 16) + vec_v_bot_low_low_2);
+        vst1q_s32(c + i + 20, vld1q_s32(c + i + 20) + vec_v_bot_low_high_2);
+        int32x4_t vec_v_bot_low_low_3 = vmovl_s16(vget_low_s16(vec_c[3]));
+        int32x4_t vec_v_bot_low_high_3 = vmovl_high_s16(vec_c[3]);
+        vst1q_s32(c + i + 24, vld1q_s32(c + i + 24) + vec_v_bot_low_low_3);
+        vst1q_s32(c + i + 28, vld1q_s32(c + i + 28) + vec_v_bot_low_high_3);
+        int32x4_t vec_v_bot_low_low_4 = vmovl_s16(vget_low_s16(vec_c[4]));
+        int32x4_t vec_v_bot_low_high_4 = vmovl_high_s16(vec_c[4]);
+        vst1q_s32(c + i + 32, vld1q_s32(c + i + 32) + vec_v_bot_low_low_4);
+        vst1q_s32(c + i + 36, vld1q_s32(c + i + 36) + vec_v_bot_low_high_4);
+        int32x4_t vec_v_bot_low_low_5 = vmovl_s16(vget_low_s16(vec_c[5]));
+        int32x4_t vec_v_bot_low_high_5 = vmovl_high_s16(vec_c[5]);
+        vst1q_s32(c + i + 40, vld1q_s32(c + i + 40) + vec_v_bot_low_low_5);
+        vst1q_s32(c + i + 44, vld1q_s32(c + i + 44) + vec_v_bot_low_high_5);
+        int32x4_t vec_v_bot_low_low_6 = vmovl_s16(vget_low_s16(vec_c[6]));
+        int32x4_t vec_v_bot_low_high_6 = vmovl_high_s16(vec_c[6]);
+        vst1q_s32(c + i + 48, vld1q_s32(c + i + 48) + vec_v_bot_low_low_6);
+        vst1q_s32(c + i + 52, vld1q_s32(c + i + 52) + vec_v_bot_low_high_6);
+        int32x4_t vec_v_bot_low_low_7 = vmovl_s16(vget_low_s16(vec_c[7]));
+        int32x4_t vec_v_bot_low_high_7 = vmovl_high_s16(vec_c[7]);
+        vst1q_s32(c + i + 56, vld1q_s32(c + i + 56) + vec_v_bot_low_low_7);
+        vst1q_s32(c + i + 60, vld1q_s32(c + i + 60) + vec_v_bot_low_high_7);
+
+    }
+#endif
+}
+
+int32_t qgemm_lut_1536_1536(void* A, void* LUT, void* Scales, void* LUT_Scales, void* C) {
+    alignas(32) uint32_t CBits[BM1536_1536];
+    memset(&(CBits[0]), 0, BM1536_1536 * sizeof(int32_t));
+#pragma unroll
+    for (int32_t k_outer = 0; k_outer < 1536 / BBK1536_1536; ++k_outer) {
+        tbl_impl_1536_1536((&(((int32_t*)CBits)[0])), (&(((int8_t*)LUT)[(k_outer * BBK1536_1536 / 2 * 32)])), (&(((uint8_t*)A)[(k_outer * BBK1536_1536 / 2 / 2 * BM1536_1536)])));
+    }
+#pragma unroll
+    for (int i = 0; i < BM1536_1536; i++) {
+        ((bitnet_float_type*)C)[i] = (((int32_t*)CBits)[i]) / ((bitnet_float_type*)LUT_Scales)[0] * ((bitnet_float_type*)Scales)[0];
+    }
+  return 0;
+};
+#include <arm_neon.h>
+
+#define BM4096_1536 256
+#define BBK4096_1536 128
+inline void tbl_impl_4096_1536(int32_t* c, int8_t* lut, uint8_t* a) {
+#ifdef __ARM_NEON
+    const int KK = BBK4096_1536 / 2;
+    const uint8x16_t vec_mask = vdupq_n_u8(0x0f);
+    const int16x8_t vec_zero = vdupq_n_s16(0x0000);
+    int8x16_t vec_lut[2 * KK];
+    int16x8_t vec_c[4];
+#pragma unroll
+    for (int k = 0; k < 2 * KK; k++) {
+        vec_lut[k] = vld1q_s8(lut + k * 16);
+    }
+
+#pragma unroll
+    for (int i = 0; i < BM4096_1536; i += 32) {
+        #pragma unroll
+        for (int i=0; i<4; i++) {
+            vec_c[i] = vandq_s16(vec_c[i], vec_zero);
+        }
+
+#pragma unroll
+        for (int k = 0; k < KK / 4; k++) {
+            
+            uint8x16_t vec_a_0 = vld1q_u8(a + i * KK / 2 + k * 32 * 2 + 0 * 16);
+            uint8x16_t vec_a0_top = vshrq_n_u8(vec_a_0, 4);
+            uint8x16_t vec_a0_bot = vandq_u8(vec_a_0, vec_mask);
+            int8x16_t  vec_v_0_left_tmp0 = vqtbl1q_s8(vec_lut[8 * k + 0], vec_a0_top);
+            int8x16_t  vec_v_0_left_tmp1 = vqtbl1q_s8(vec_lut[8 * k + 1], vec_a0_top);
+            int8x16_t  vec_v_0_right_tmp0 = vqtbl1q_s8(vec_lut[8 * k + 2], vec_a0_bot);
+            int8x16_t  vec_v_0_right_tmp1 = vqtbl1q_s8(vec_lut[8 * k + 3], vec_a0_bot);
+            int8x16x2_t  vec_v_left_0 = vzipq_s8(vec_v_0_left_tmp1, vec_v_0_left_tmp0);
+            int8x16x2_t  vec_v_right_0 = vzipq_s8(vec_v_0_right_tmp1, vec_v_0_right_tmp0);
+            vec_c[0] += vec_v_left_0.val[0];
+            vec_c[0] += vec_v_right_0.val[0];
+            vec_c[1] += vec_v_left_0.val[1];
+            vec_c[1] += vec_v_right_0.val[1];
+        
+            uint8x16_t vec_a_1 = vld1q_u8(a + i * KK / 2 + k * 32 * 2 + 1 * 16);
+            uint8x16_t vec_a1_top = vshrq_n_u8(vec_a_1, 4);
+            uint8x16_t vec_a1_bot = vandq_u8(vec_a_1, vec_mask);
+            int8x16_t  vec_v_1_left_tmp0 = vqtbl1q_s8(vec_lut[8 * k + 4], vec_a1_top);
+            int8x16_t  vec_v_1_left_tmp1 = vqtbl1q_s8(vec_lut[8 * k + 5], vec_a1_top);
+            int8x16_t  vec_v_1_right_tmp0 = vqtbl1q_s8(vec_lut[8 * k + 6], vec_a1_bot);
+            int8x16_t  vec_v_1_right_tmp1 = vqtbl1q_s8(vec_lut[8 * k + 7], vec_a1_bot);
+            int8x16x2_t  vec_v_left_1 = vzipq_s8(vec_v_1_left_tmp1, vec_v_1_left_tmp0);
+            int8x16x2_t  vec_v_right_1 = vzipq_s8(vec_v_1_right_tmp1, vec_v_1_right_tmp0);
+            vec_c[0] += vec_v_left_1.val[0];
+            vec_c[0] += vec_v_right_1.val[0];
+            vec_c[1] += vec_v_left_1.val[1];
+            vec_c[1] += vec_v_right_1.val[1];
+        
+            uint8x16_t vec_a_2 = vld1q_u8(a + i * KK / 2 + k * 32 * 2 + 2 * 16);
+            uint8x16_t vec_a2_top = vshrq_n_u8(vec_a_2, 4);
+            uint8x16_t vec_a2_bot = vandq_u8(vec_a_2, vec_mask);
+            int8x16_t  vec_v_2_left_tmp0 = vqtbl1q_s8(vec_lut[8 * k + 0], vec_a2_top);
+            int8x16_t  vec_v_2_left_tmp1 = vqtbl1q_s8(vec_lut[8 * k + 1], vec_a2_top);
+            int8x16_t  vec_v_2_right_tmp0 = vqtbl1q_s8(vec_lut[8 * k + 2], vec_a2_bot);
+            int8x16_t  vec_v_2_right_tmp1 = vqtbl1q_s8(vec_lut[8 * k + 3], vec_a2_bot);
+            int8x16x2_t  vec_v_left_2 = vzipq_s8(vec_v_2_left_tmp1, vec_v_2_left_tmp0);
+            int8x16x2_t  vec_v_right_2 = vzipq_s8(vec_v_2_right_tmp1, vec_v_2_right_tmp0);
+            vec_c[2] += vec_v_left_2.val[0];
+            vec_c[2] += vec_v_right_2.val[0];
+            vec_c[3] += vec_v_left_2.val[1];
+            vec_c[3] += vec_v_right_2.val[1];
+        
+            uint8x16_t vec_a_3 = vld1q_u8(a + i * KK / 2 + k * 32 * 2 + 3 * 16);
+            uint8x16_t vec_a3_top = vshrq_n_u8(vec_a_3, 4);
+            uint8x16_t vec_a3_bot = vandq_u8(vec_a_3, vec_mask);
+            int8x16_t  vec_v_3_left_tmp0 = vqtbl1q_s8(vec_lut[8 * k + 4], vec_a3_top);
+            int8x16_t  vec_v_3_left_tmp1 = vqtbl1q_s8(vec_lut[8 * k + 5], vec_a3_top);
+            int8x16_t  vec_v_3_right_tmp0 = vqtbl1q_s8(vec_lut[8 * k + 6], vec_a3_bot);
+            int8x16_t  vec_v_3_right_tmp1 = vqtbl1q_s8(vec_lut[8 * k + 7], vec_a3_bot);
+            int8x16x2_t  vec_v_left_3 = vzipq_s8(vec_v_3_left_tmp1, vec_v_3_left_tmp0);
+            int8x16x2_t  vec_v_right_3 = vzipq_s8(vec_v_3_right_tmp1, vec_v_3_right_tmp0);
+            vec_c[2] += vec_v_left_3.val[0];
+            vec_c[2] += vec_v_right_3.val[0];
+            vec_c[3] += vec_v_left_3.val[1];
+            vec_c[3] += vec_v_right_3.val[1];
+        
+       }
+
+        int32x4_t vec_v_bot_low_low_0 = vmovl_s16(vget_low_s16(vec_c[0]));
+        int32x4_t vec_v_bot_low_high_0 = vmovl_high_s16(vec_c[0]);
+        vst1q_s32(c + i + 0, vld1q_s32(c + i + 0) + vec_v_bot_low_low_0);
+        vst1q_s32(c + i + 4, vld1q_s32(c + i + 4) + vec_v_bot_low_high_0);
+        int32x4_t vec_v_bot_low_low_1 = vmovl_s16(vget_low_s16(vec_c[1]));
+        int32x4_t vec_v_bot_low_high_1 = vmovl_high_s16(vec_c[1]);
+        vst1q_s32(c + i + 8, vld1q_s32(c + i + 8) + vec_v_bot_low_low_1);
+        vst1q_s32(c + i + 12, vld1q_s32(c + i + 12) + vec_v_bot_low_high_1);
+        int32x4_t vec_v_bot_low_low_2 = vmovl_s16(vget_low_s16(vec_c[2]));
+        int32x4_t vec_v_bot_low_high_2 = vmovl_high_s16(vec_c[2]);
+        vst1q_s32(c + i + 16, vld1q_s32(c + i + 16) + vec_v_bot_low_low_2);
+        vst1q_s32(c + i + 20, vld1q_s32(c + i + 20) + vec_v_bot_low_high_2);
+        int32x4_t vec_v_bot_low_low_3 = vmovl_s16(vget_low_s16(vec_c[3]));
+        int32x4_t vec_v_bot_low_high_3 = vmovl_high_s16(vec_c[3]);
+        vst1q_s32(c + i + 24, vld1q_s32(c + i + 24) + vec_v_bot_low_low_3);
+        vst1q_s32(c + i + 28, vld1q_s32(c + i + 28) + vec_v_bot_low_high_3);
+
+    }
+#endif
+}
+
+int32_t qgemm_lut_4096_1536(void* A, void* LUT, void* Scales, void* LUT_Scales, void* C) {
+    alignas(32) uint32_t CBits[BM4096_1536];
+    memset(&(CBits[0]), 0, BM4096_1536 * sizeof(int32_t));
+#pragma unroll
+    for (int32_t k_outer = 0; k_outer < 1536 / BBK4096_1536; ++k_outer) {
+        tbl_impl_4096_1536((&(((int32_t*)CBits)[0])), (&(((int8_t*)LUT)[(k_outer * BBK4096_1536 / 2 * 32)])), (&(((uint8_t*)A)[(k_outer * BBK4096_1536 / 2 / 2 * BM4096_1536)])));
+    }
+#pragma unroll
+    for (int i = 0; i < BM4096_1536; i++) {
+        ((bitnet_float_type*)C)[i] = (((int32_t*)CBits)[i]) / ((bitnet_float_type*)LUT_Scales)[0] * ((bitnet_float_type*)Scales)[0];
+    }
+  return 0;
+};
+
+template <int K>
+void preprocessor_k(void* B, void* LUT_Scales, void* QLUT) {
+  partial_max_reset((&(((bitnet_float_type*)LUT_Scales)[0])));
+  per_tensor_quant(K, (&(((bitnet_float_type*)LUT_Scales)[0])), (&(((bitnet_float_type*)B)[0])));
+
+  lut_ctor<K>((&(((int8_t*)QLUT)[0])), (&(((bitnet_float_type*)B)[0])), (&(((bitnet_float_type*)LUT_Scales)[0])));
+}
+void ggml_preprocessor(int m, int k, void* B, void* LUT_Scales, void* QLUT) {
+    if (m == 1536 && k == 4096) {
+        preprocessor_k<4096>(B, LUT_Scales, QLUT);
+    }
+    else if (m == 1536 && k == 1536) {
+        preprocessor_k<1536>(B, LUT_Scales, QLUT);
+    }
+    else if (m == 4096 && k == 1536) {
+        preprocessor_k<1536>(B, LUT_Scales, QLUT);
+    }
+}
+void ggml_qgemm_lut(int m, int k, void* A, void* LUT, void* Scales, void* LUT_Scales, void* C) {
+    if (m == 1536 && k == 4096) {
+        qgemm_lut_1536_4096(A, LUT, Scales, LUT_Scales, C);
+    }
+    else if (m == 1536 && k == 1536) {
+        qgemm_lut_1536_1536(A, LUT, Scales, LUT_Scales, C);
+    }
+    else if (m == 4096 && k == 1536) {
+        qgemm_lut_4096_1536(A, LUT, Scales, LUT_Scales, C);
+    }
+}
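+// Illustrative call order for one m x k GEMV through these entry points
+// (buffer allocation elided):
+//   ggml_preprocessor(m, k, B, lut_scales, qlut);           // quantize activations into a LUT
+//   ggml_qgemm_lut(m, k, A, qlut, scales, lut_scales, C);   // LUT-driven matmul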
+
+void ggml_bitnet_transform_tensor(struct ggml_tensor * tensor) {
+    if (!(is_type_supported(tensor->type) && tensor->backend == GGML_BACKEND_TYPE_CPU && tensor->extra == nullptr)) {
+        return;
+    }
+
+    int k = tensor->ne[0];
+    int m = tensor->ne[1];
+    const int lut_scales_size = 1;
+    const int scales_size = 1;
+    int bk = 0;
+    int bm = 0;
+
+    if (m == 1536 && k == 4096) {
+        bm = BM1536_4096;
+        bk = BBK1536_4096;
+    }
+    else if (m == 1536 && k == 1536) {
+        bm = BM1536_1536;
+        bk = BBK1536_1536;
+    }
+    else if (m == 4096 && k == 1536) {
+        bm = BM4096_1536;
+        bk = BBK4096_1536;
+    }
+
+    const int n_tile_num = m / bm;
+    const int BK = bk;
+    uint8_t * qweights;
+    bitnet_float_type * scales;
+
+    scales = (bitnet_float_type *) aligned_malloc(sizeof(bitnet_float_type));
+    qweights = (uint8_t *) tensor->data;
+    float * i2_scales = (float * )(qweights + k * m / 4);
+    scales[0] = (bitnet_float_type) i2_scales[0];
+
+    tensor->extra = bitnet_tensor_extras + bitnet_tensor_extras_index;
+    bitnet_tensor_extras[bitnet_tensor_extras_index++] = {
+        /* .lut_scales_size = */ lut_scales_size,
+        /* .BK              = */ BK,
+        /* .n_tile_num      = */ n_tile_num,
+        /* .qweights        = */ qweights,
+        /* .scales          = */ scales
+    };
+}
+#endif
\ No newline at end of file
diff --git a/include/ggml-bitnet.h b/include/ggml-bitnet.h
index 3f8571cc6..bf373dfab 100644
--- a/include/ggml-bitnet.h
+++ b/include/ggml-bitnet.h
@@ -5,8 +5,13 @@
 
 #ifdef __ARM_NEON
#include <arm_neon.h>
+#if defined(GGML_BITNET_ARM_TL1)
 typedef float32_t bitnet_float_type;
 #else
+typedef float16_t bitnet_float_type;
+#endif
+#else
+#include <immintrin.h>
 typedef float bitnet_float_type;
 #endif
 
@@ -43,6 +48,10 @@ GGML_API void ggml_preprocessor(int m, int k, void* B, void* LUT_Scales, void* Q
 GGML_API void ggml_qgemm_lut(int bs, int m, int k, int BK, void* A, void* sign, void* LUT, void* Scales, void* LUT_Scales, void* C);
 GGML_API void ggml_preprocessor(int bs, int m, int three_k, int two_k, void* B, void* LUT_Scales, void* Three_QLUT, void* Two_QLUT);
 #endif
+#if defined(GGML_BITNET_TL2_LOSS)
+GGML_API void ggml_qgemm_lut(int bs, int m, int k, int BK, void* A, void* sign, void* LUT, void* Scales, void* LUT_Scales, void* C);
+GGML_API void ggml_preprocessor(int bs, int m, int three_k, int two_k, void* B, void* Three_LUT_Scales, void* Two_LUT_Scales, void* Three_QLUT, void* Two_QLUT);
+#endif
 
 #ifdef  __cplusplus
 }
diff --git a/include/kernel_config.ini b/include/kernel_config.ini
new file mode 100644
index 000000000..5d94318da
--- /dev/null
+++ b/include/kernel_config.ini
@@ -0,0 +1,21 @@
+[Kernels_0]
+m = 1536
+k = 4096
+bm = 256
+bk = 128
+bmm = 32
+
+[Kernels_1]
+m = 1536
+k = 1536
+bm = 128
+bk = 64
+bmm = 64
+
+[Kernels_2]
+m = 4096
+k = 1536
+bm = 256
+bk = 128
+bmm = 32
+
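+# Note: bm/bk here appear to mirror the --BM/--BK block tiles and bmm the
+# --bm inner tile passed to the codegen scripts for this model's kernel shapes.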
diff --git a/setup_env.py b/setup_env.py
index 9256324fb..12d510029 100644
--- a/setup_env.py
+++ b/setup_env.py
@@ -44,8 +44,8 @@
 }
 
 SUPPORTED_QUANT_TYPES = {
-    "arm64": ["i2_s", "tl1"],
-    "x86_64": ["i2_s", "tl2"]
+    "arm64": ["i2_s", "tl1", "tl2-loss"],
+    "x86_64": ["i2_s", "tl2", "tl2-loss"]
 }
 
 COMPILER_EXTRA_ARGS = {
@@ -111,8 +111,10 @@ def prepare_model():
     gguf_path = os.path.join(model_dir, "ggml-model-" + quant_type + ".gguf")
     if not os.path.exists(gguf_path) or os.path.getsize(gguf_path) == 0:
         logging.info(f"Converting HF model to GGUF format...")
-        if quant_type.startswith("tl"):
+        if quant_type in ["tl1", "tl2"]:
             run_command([sys.executable, "utils/convert-hf-to-gguf-bitnet.py", model_dir, "--outtype", quant_type, "--quant-embd"], log_step="convert_to_tl")
+        elif quant_type in ["tl2-loss"]:
+            run_command([sys.executable, "utils/convert-hf-to-gguf-bitnet.py", model_dir, "--outtype", "tl2", "--quant-embd", "--loss", "--outfile", os.path.join(model_dir, "ggml-model-tl2-loss.gguf")], log_step="convert_to_tl")
         else: # i2s
             # convert to f32
             run_command([sys.executable, "utils/convert-hf-to-gguf-bitnet.py", model_dir, "--outtype", "f32"], log_step="convert_to_f32_gguf")
@@ -156,11 +158,20 @@ def gen_code():
                 shutil.copyfile(os.path.join(pretuned_kernels, "bitnet-lut-kernels-tl2.h"), "include/bitnet-lut-kernels.h")
                 shutil.copyfile(os.path.join(pretuned_kernels, "kernel_config_tl2.ini"), "include/kernel_config.ini")
         if get_model_name() == "bitnet_b1_58-large":
-            run_command([sys.executable, "utils/codegen_tl1.py", "--model", "bitnet_b1_58-large", "--BM", "256,128,256", "--BK", "128,64,128", "--bm", "32,64,32"], log_step="codegen")
+            if args.quant_type == "tl2-loss":
+                run_command([sys.executable, "utils/codegen_tl2_loss.py", "--model", "bitnet_b1_58-large", "--BM", "256,128,256", "--BK", "96,96,96", "--bm", "32,32,32"], log_step="codegen")
+            else:
+                run_command([sys.executable, "utils/codegen_tl1.py", "--model", "bitnet_b1_58-large", "--BM", "256,128,256", "--BK", "128,64,128", "--bm", "32,64,32"], log_step="codegen")
         elif get_model_name() in llama3_f3_models:
-            run_command([sys.executable, "utils/codegen_tl1.py", "--model", "Llama3-8B-1.58-100B-tokens", "--BM", "256,128,256,128", "--BK", "128,64,128,64", "--bm", "32,64,32,64"], log_step="codegen")
+            if args.quant_type == "tl2-loss":
+                run_command([sys.executable, "utils/codegen_tl2_loss.py", "--model", "Llama3-8B-1.58-100B-tokens", "--BM", "256,128,256", "--BK", "96,96,96", "--bm", "32,32,32"], log_step="codegen")
+            else:
+                run_command([sys.executable, "utils/codegen_tl1.py", "--model", "Llama3-8B-1.58-100B-tokens", "--BM", "256,128,256,128", "--BK", "128,64,128,64", "--bm", "32,64,32,64"], log_step="codegen")
         elif get_model_name() == "bitnet_b1_58-3B":
-            run_command([sys.executable, "utils/codegen_tl1.py", "--model", "bitnet_b1_58-3B", "--BM", "160,320,320", "--BK", "64,128,64", "--bm", "32,64,32"], log_step="codegen")
+            if args.quant_type == "tl2-loss":
+                run_command([sys.executable, "utils/codegen_tl2_loss.py", "--model", "bitnet_b1_58-3B", "--BM", "160,320,320", "--BK", "96,96,96", "--bm", "32,32,32"], log_step="codegen")
+            else:
+                run_command([sys.executable, "utils/codegen_tl1.py", "--model", "bitnet_b1_58-3B", "--BM", "160,320,320", "--BK", "64,128,64", "--bm", "32,64,32"], log_step="codegen")
         else:
             raise NotImplementedError()
     else:
@@ -172,11 +183,20 @@ def gen_code():
                 sys.exit(1)
             shutil.copyfile(os.path.join(pretuned_kernels, "bitnet-lut-kernels-tl2.h"), "include/bitnet-lut-kernels.h")
         if get_model_name() == "bitnet_b1_58-large":
-            run_command([sys.executable, "utils/codegen_tl2.py", "--model", "bitnet_b1_58-large", "--BM", "256,128,256", "--BK", "96,192,96", "--bm", "32,32,32"], log_step="codegen")
+            if args.quant_type == "tl2-loss":
+                run_command([sys.executable, "utils/codegen_tl2_loss.py", "--model", "bitnet_b1_58-large", "--BM", "256,128,256", "--BK", "96,96,96", "--bm", "32,32,32"], log_step="codegen")
+            else:
+                run_command([sys.executable, "utils/codegen_tl2.py", "--model", "bitnet_b1_58-large", "--BM", "256,128,256", "--BK", "96,192,96", "--bm", "32,32,32"], log_step="codegen")
         elif get_model_name() in llama3_f3_models:
-            run_command([sys.executable, "utils/codegen_tl2.py", "--model", "Llama3-8B-1.58-100B-tokens", "--BM", "256,128,256,128", "--BK", "96,96,96,96", "--bm", "32,32,32,32"], log_step="codegen")
+            if args.quant_type == "tl2-loss":
+                run_command([sys.executable, "utils/codegen_tl2_loss.py", "--model", "Llama3-8B-1.58-100B-tokens", "--BM", "256,128,256", "--BK", "96,96,96", "--bm", "32,32,32"], log_step="codegen")
+            else:
+                run_command([sys.executable, "utils/codegen_tl2.py", "--model", "Llama3-8B-1.58-100B-tokens", "--BM", "256,128,256,128", "--BK", "96,96,96,96", "--bm", "32,32,32,32"], log_step="codegen")
         elif get_model_name() == "bitnet_b1_58-3B":
-            run_command([sys.executable, "utils/codegen_tl2.py", "--model", "bitnet_b1_58-3B", "--BM", "160,320,320", "--BK", "96,96,96", "--bm", "32,32,32"], log_step="codegen")
+            if args.quant_type == "tl2-loss":
+                run_command([sys.executable, "utils/codegen_tl2_loss.py", "--model", "bitnet_b1_58-3B", "--BM", "160,320,320", "--BK", "96,96,96", "--bm", "32,32,32"], log_step="codegen")
+            else:
+                run_command([sys.executable, "utils/codegen_tl2.py", "--model", "bitnet_b1_58-3B", "--BM", "160,320,320", "--BK", "96,96,96", "--bm", "32,32,32"], log_step="codegen")
         else:
             raise NotImplementedError()
 
@@ -192,7 +212,10 @@ def compile():
         logging.error(f"Arch {arch} is not supported yet")
         exit(0)
     logging.info("Compiling the code using CMake.")
-    run_command(["cmake", "-B", "build", *COMPILER_EXTRA_ARGS[arch], *OS_EXTRA_ARGS.get(platform.system(), [])], log_step="generate_build_files")
+    if args.quant_type == "tl2-loss":
+        run_command(["cmake", "-B", "build", "-DBITNET_TL2_LOSS=ON", *OS_EXTRA_ARGS.get(platform.system(), [])], log_step="generate_build_files")
+    else:
+        run_command(["cmake", "-B", "build", *COMPILER_EXTRA_ARGS[arch], *OS_EXTRA_ARGS.get(platform.system(), [])], log_step="generate_build_files")
     # run_command(["cmake", "--build", "build", "--target", "llama-cli", "--config", "Release"])
     run_command(["cmake", "--build", "build", "--config", "Release"], log_step="compile")
 
diff --git a/src/ggml-bitnet-lut.cpp b/src/ggml-bitnet-lut.cpp
index 59422d548..680cdab35 100644
--- a/src/ggml-bitnet-lut.cpp
+++ b/src/ggml-bitnet-lut.cpp
@@ -154,6 +154,80 @@ size_t ggml_bitnet_mul_mat_get_wsize(const struct ggml_tensor * src0, const stru
     return wsize;
 }
 
+int ggml_bitnet_get_type_bits(enum ggml_type type) {
+    switch (type) {
+        case GGML_TYPE_TL2:
+            return 2;
+        case GGML_TYPE_Q4_0:
+            return 4;
+        default:
+            return 0;
+    }
+}
+#endif
+
+#if defined(GGML_BITNET_TL2_LOSS)
+void ggml_bitnet_init(void) {
+    // LOG(INFO) << "ggml_bitnet_init";
+
+    if (initialized) {
+        return;
+    }
+    initialized = true;
+
+    // if (wrapper == nullptr) {
+    //     wrapper = new BITNET::BITNETGeMMWrapper();
+    // }
+    if (bitnet_tensor_extras == nullptr) {
+        bitnet_tensor_extras = new bitnet_tensor_extra[GGML_BITNET_MAX_NODES];
+    }
+    bitnet_tensor_extras_index = 0;
+}
+
+void ggml_bitnet_free(void) {
+    // LOG(INFO) << "ggml_bitnet_free";
+
+    if (!initialized) {
+        return;
+    }
+    initialized = false;
+
+    // delete wrapper;
+    // wrapper = nullptr;
+    for (size_t i = 0; i < bitnet_tensor_extras_index; i++) {
+        // aligned_free(bitnet_tensor_extras[i].qweights);
+        // aligned_free(bitnet_tensor_extras[i].scales);
+    }
+    delete[] bitnet_tensor_extras;
+    bitnet_tensor_extras = nullptr;
+}
+
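+// Note: the tl2-loss path only claims single-column activations
+// (src1->ne[1] <= 1), i.e. token-at-a-time GEMV; larger batches are left
+// to the default mul_mat path.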
+bool ggml_bitnet_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, const struct ggml_tensor * dst) {
+    if ((is_type_supported(src0->type)) &&
+        src1->type == GGML_TYPE_F32 &&
+        dst->type == GGML_TYPE_F32 &&
+        src0->backend == GGML_BACKEND_TYPE_CPU) {
+        if (src1->ne[1] <= 1) {
+            return true;
+        }
+    }
+    return false;
+}
+
+size_t ggml_bitnet_mul_mat_get_wsize(const struct ggml_tensor * src0, const struct ggml_tensor * src1, const struct ggml_tensor * dst) {
+    const size_t ne01 = src0->ne[1];
+    const size_t ne10 = src1->ne[0];
+    const size_t ne11 = src1->ne[1];
+    
+    size_t wsize = ne10 * ne11 * 11 * sizeof(int8_t) + 2 * ne11 * 2 * sizeof(bitnet_float_type);
+    if (sizeof(bitnet_float_type) == 2) {
+        // Need fp32 to fp16 conversion
+        wsize += std::max(ne10, ne01) * ne11 * sizeof(bitnet_float_type);
+    }
+    wsize = ((wsize - 1) / 64 + 1) * 64;
+    return wsize;
+}
+
 int ggml_bitnet_get_type_bits(enum ggml_type type) {
     switch (type) {
         case GGML_TYPE_TL2:
diff --git a/utils/codegen_tl2_loss.py b/utils/codegen_tl2_loss.py
new file mode 100644
index 000000000..ced99032a
--- /dev/null
+++ b/utils/codegen_tl2_loss.py
@@ -0,0 +1,1056 @@
+import argparse
+import os
+from configparser import ConfigParser
+
+def gen_ctor_code():
+    kernel_code = "\n\
+#include \"ggml-bitnet.h\"\n\
+#include <immintrin.h>\n\
+#define GGML_BITNET_MAX_NODES 8192\n\
+static bool initialized = false;\n\
+static bitnet_tensor_extra * bitnet_tensor_extras = nullptr;\n\
+static size_t bitnet_tensor_extras_index = 0;\n\
+static void * aligned_malloc(size_t size) {\n\
+#if defined(_WIN32)\n\
+    return _aligned_malloc(size, 64);\n\
+#else\n\
+    void * ptr = nullptr;\n\
+    posix_memalign(&ptr, 64, size);\n\
+    return ptr;\n\
+#endif\n\
+}\n\
+\n\
+static void aligned_free(void * ptr) {\n\
+#if defined(_WIN32)\n\
+    _aligned_free(ptr);\n\
+#else\n\
+    free(ptr);\n\
+#endif\n\
+}\n\
+#define BK2 32\n\
+#if defined __AVX2__\n\
+inline void _mm256_merge_epi32(const __m256i v0, const __m256i v1, __m256i *vl, __m256i *vh)\n\
+{\n\
+    __m256i va = _mm256_permute4x64_epi64(v0, _MM_SHUFFLE(3, 1, 2, 0));\n\
+    __m256i vb = _mm256_permute4x64_epi64(v1, _MM_SHUFFLE(3, 1, 2, 0));\n\
+    *vl = _mm256_unpacklo_epi32(va, vb);\n\
+    *vh = _mm256_unpackhi_epi32(va, vb);\n\
+}\n\
+inline void _mm256_merge_epi64(const __m256i v0, const __m256i v1, __m256i *vl, __m256i *vh)\n\
+{\n\
+    __m256i va = _mm256_permute4x64_epi64(v0, _MM_SHUFFLE(3, 1, 2, 0));\n\
+    __m256i vb = _mm256_permute4x64_epi64(v1, _MM_SHUFFLE(3, 1, 2, 0));\n\
+    *vl = _mm256_unpacklo_epi64(va, vb);\n\
+    *vh = _mm256_unpackhi_epi64(va, vb);\n\
+}\n\
+inline void _mm256_merge_si128(const __m256i v0, const __m256i v1, __m256i *vl, __m256i *vh)\n\
+{\n\
+    *vl = _mm256_permute2x128_si256(v0, v1, _MM_SHUFFLE(0, 2, 0, 0));\n\
+    *vh = _mm256_permute2x128_si256(v0, v1, _MM_SHUFFLE(0, 3, 0, 1));\n\
+}\n\
+inline void Transpose_8_8(\n\
+    __m256i *v0,\n\
+    __m256i *v1,\n\
+    __m256i *v2,\n\
+    __m256i *v3,\n\
+    __m256i *v4,\n\
+    __m256i *v5,\n\
+    __m256i *v6,\n\
+    __m256i *v7)\n\
+{\n\
+    __m256i w0, w1, w2, w3, w4, w5, w6, w7;\n\
+    __m256i x0, x1, x2, x3, x4, x5, x6, x7;\n\
+    _mm256_merge_epi32(*v0, *v1, &w0, &w1);\n\
+    _mm256_merge_epi32(*v2, *v3, &w2, &w3);\n\
+    _mm256_merge_epi32(*v4, *v5, &w4, &w5);\n\
+    _mm256_merge_epi32(*v6, *v7, &w6, &w7);\n\
+    _mm256_merge_epi64(w0, w2, &x0, &x1);\n\
+    _mm256_merge_epi64(w1, w3, &x2, &x3);\n\
+    _mm256_merge_epi64(w4, w6, &x4, &x5);\n\
+    _mm256_merge_epi64(w5, w7, &x6, &x7);\n\
+    _mm256_merge_si128(x0, x4, v0, v1);\n\
+    _mm256_merge_si128(x1, x5, v2, v3);\n\
+    _mm256_merge_si128(x2, x6, v4, v5);\n\
+    _mm256_merge_si128(x3, x7, v6, v7);\n\
+}\n\
+#elif defined __ARM_NEON\n\
+inline void Transpose_8_8(\n\
+    int8x8_t *v0,\n\
+    int8x8_t *v1,\n\
+    int8x8_t *v2,\n\
+    int8x8_t *v3,\n\
+    int8x8_t *v4,\n\
+    int8x8_t *v5,\n\
+    int8x8_t *v6,\n\
+    int8x8_t *v7)\n\
+{\n\
+    int8x8x2_t q04 = vzip_s8(*v0, *v4);\n\
+    int8x8x2_t q15 = vzip_s8(*v1, *v5);\n\
+    int8x8x2_t q26 = vzip_s8(*v2, *v6);\n\
+    int8x8x2_t q37 = vzip_s8(*v3, *v7);\n\
+    int8x8x2_t q0246_0 = vzip_s8(q04.val[0], q26.val[0]);\n\
+    int8x8x2_t q0246_1 = vzip_s8(q04.val[1], q26.val[1]);\n\
+    int8x8x2_t q1357_0 = vzip_s8(q15.val[0], q37.val[0]);\n\
+    int8x8x2_t q1357_1 = vzip_s8(q15.val[1], q37.val[1]);\n\
+    int8x8x2_t q_fin_0 = vzip_s8(q0246_0.val[0], q1357_0.val[0]);\n\
+    int8x8x2_t q_fin_1 = vzip_s8(q0246_0.val[1], q1357_0.val[1]);\n\
+    int8x8x2_t q_fin_2 = vzip_s8(q0246_1.val[0], q1357_1.val[0]);\n\
+    int8x8x2_t q_fin_3 = vzip_s8(q0246_1.val[1], q1357_1.val[1]);\n\
+    *v0 = q_fin_0.val[0];\n\
+    *v1 = q_fin_0.val[1];\n\
+    *v2 = q_fin_1.val[0];\n\
+    *v3 = q_fin_1.val[1];\n\
+    *v4 = q_fin_2.val[0];\n\
+    *v5 = q_fin_2.val[1];\n\
+    *v6 = q_fin_3.val[0];\n\
+    *v7 = q_fin_3.val[1];\n\
+}\n\
+#endif\n\
+inline int32_t two_partial_max(void* lut_scales_, void* b_) {\n\
+    bitnet_float_type* lut_scales = (bitnet_float_type*)lut_scales_;\n\
+    bitnet_float_type* b = (bitnet_float_type*)b_;\n\
+#if defined __AVX2__\n\
+    const __m256i vec_bi = _mm256_set_epi32(56, 48, 40, 32, 24, 16, 8, 0);\n\
+    __m256 vec_b0 = _mm256_i32gather_ps(b + 0, vec_bi, 1);\n\
+    __m256 vec_b1 = _mm256_i32gather_ps(b + 1, vec_bi, 1);\n\
+    const __m256 vec_sign = _mm256_set1_ps(-0.0f);\n\
+    __m256 vec_babs0 = _mm256_andnot_ps(vec_sign, vec_b0);\n\
+    __m256 vec_babs1 = _mm256_andnot_ps(vec_sign, vec_b1);\n\
+    __m256 abssum = _mm256_add_ps(vec_babs0, vec_babs1);\n\
+    __m128 max2 = _mm_max_ps(_mm256_extractf128_ps(abssum, 1), _mm256_castps256_ps128(abssum));\n\
+    max2 = _mm_max_ps(max2, _mm_movehl_ps(max2, max2));\n\
+    max2 = _mm_max_ss(max2, _mm_movehdup_ps(max2));\n\
+    bitnet_float_type scales = _mm_cvtss_f32(max2) / 127;\n\
+    *lut_scales = std::max(*lut_scales, scales);\n\
+#elif defined __ARM_NEON\n\
+    float16x8x2_t vec_bs = vld2q_f16(b);\n\
+    float16x8_t abssum = vabsq_f16(vec_bs.val[0]) + vabsq_f16(vec_bs.val[1]);\n\
+    float16_t scales = vmaxvq_f16(abssum) / 127;\n\
+    *lut_scales = std::max(*lut_scales, scales);\n\
+#endif\n\
+    return 0;\n\
+}\n\
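+// Note: the running scale is max_i(|b_2i| + |b_2i+1|) / 127, the largest\n\
+// magnitude a two-element LUT entry (+/-b0 +/- b1) can reach, so all\n\
+// quantized entries fit in int8.\n\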
+inline int32_t three_partial_max(void* lut_scales_, void* b_) {\n\
+    bitnet_float_type* lut_scales = (bitnet_float_type*)lut_scales_;\n\
+    bitnet_float_type* b = (bitnet_float_type*)b_;\n\
+#if defined __AVX2__\n\
+    const __m256i vec_bi = _mm256_set_epi32(84, 72, 60, 48, 36, 24, 12, 0);\n\
+    __m256 vec_b0 = _mm256_i32gather_ps(b + 0, vec_bi, 1);\n\
+    __m256 vec_b1 = _mm256_i32gather_ps(b + 1, vec_bi, 1);\n\
+    __m256 vec_b2 = _mm256_i32gather_ps(b + 2, vec_bi, 1);\n\
+    const __m256 vec_sign = _mm256_set1_ps(-0.0f);\n\
+    __m256 vec_babs0 = _mm256_andnot_ps(vec_sign, vec_b0);\n\
+    __m256 vec_babs1 = _mm256_andnot_ps(vec_sign, vec_b1);\n\
+    __m256 vec_babs2 = _mm256_andnot_ps(vec_sign, vec_b2);\n\
+    __m256 abssum = _mm256_add_ps(_mm256_add_ps(vec_babs0, vec_babs1), vec_babs2);\n\
+    __m128 max3 = _mm_max_ps(_mm256_extractf128_ps(abssum, 1), _mm256_castps256_ps128(abssum));\n\
+    max3 = _mm_max_ps(max3, _mm_movehl_ps(max3, max3));\n\
+    max3 = _mm_max_ss(max3, _mm_movehdup_ps(max3));\n\
+    bitnet_float_type scales = _mm_cvtss_f32(max3) / 127;\n\
+    *lut_scales = std::max(*lut_scales, scales);\n\
+#elif defined __ARM_NEON\n\
+    float16x8x3_t vec_bs = vld3q_f16(b);\n\
+    float16x8_t abssum = vabsq_f16(vec_bs.val[0]) + vabsq_f16(vec_bs.val[1]) + vabsq_f16(vec_bs.val[2]);\n\
+    float16_t scales = vmaxvq_f16(abssum) / 127;\n\
+    *lut_scales = std::max(*lut_scales, scales);\n\
+#endif\n\
+    return 0;\n\
+}\n\
+inline int32_t partial_max_reset(int32_t bs, void* lut_scales_) {\n\
+    bitnet_float_type* lut_scales = (bitnet_float_type*)lut_scales_;\n\
+    #pragma unroll\n\
+    for (int i=0; i< bs; i++) {\n\
+        lut_scales[i] = 0.0;\n\
+    }\n\
+    return 0;\n\
+}\n\
+template <int act_k>\n\
+inline int32_t three_lut_ctor(int8_t* qlut, bitnet_float_type* b, bitnet_float_type* lut_scales) {\n\
+#if defined __AVX2__\n\
+    __m256 vec_lut[16];\n\
+    const __m256i vec_bi = _mm256_set_epi32(84, 72, 60, 48, 36, 24, 12, 0);\n\
+    bitnet_float_type scales = *lut_scales;\n\
+    bitnet_float_type t_scales = scales ? 1.0f / scales : 0.0f;\n\
+#pragma unroll\n\
+    for (int k = 0; k < act_k / 24; ++k) {\n\
+        __m256 vec_b0 = _mm256_i32gather_ps(b + k * 24 + 0, vec_bi, 1);\n\
+        __m256 vec_b1 = _mm256_i32gather_ps(b + k * 24 + 1, vec_bi, 1);\n\
+        __m256 vec_b2 = _mm256_i32gather_ps(b + k * 24 + 2, vec_bi, 1);\n\
+\n\
+        vec_lut[15] = _mm256_setzero_ps();\n\
+        vec_lut[14] = _mm256_setzero_ps();\n\
+        vec_lut[13] = vec_b0;\n\
+        vec_lut[13] = _mm256_add_ps(vec_lut[13], vec_b1);\n\
+        vec_lut[13] = _mm256_add_ps(vec_lut[13], vec_b2);\n\
+        vec_lut[12] = vec_b0;\n\
+        vec_lut[12] = _mm256_add_ps(vec_lut[12], vec_b1);\n\
+        vec_lut[11] = vec_b0;\n\
+        vec_lut[11] = _mm256_add_ps(vec_lut[11], vec_b1);\n\
+        vec_lut[11] = _mm256_sub_ps(vec_lut[11], vec_b2);\n\
+        vec_lut[10] = vec_b0;\n\
+        vec_lut[10] = _mm256_add_ps(vec_lut[10], vec_b2);\n\
+        vec_lut[9] = vec_b0;\n\
+        vec_lut[8] = vec_b0;\n\
+        vec_lut[8] = _mm256_sub_ps(vec_lut[8], vec_b2);\n\
+        vec_lut[7] = vec_b0;\n\
+        vec_lut[7] = _mm256_sub_ps(vec_lut[7], vec_b1);\n\
+        vec_lut[7] = _mm256_add_ps(vec_lut[7], vec_b2);\n\
+        vec_lut[6] = vec_b0;\n\
+        vec_lut[6] = _mm256_sub_ps(vec_lut[6], vec_b1);\n\
+        vec_lut[5] = vec_b0;\n\
+        vec_lut[5] = _mm256_sub_ps(vec_lut[5], vec_b1);\n\
+        vec_lut[5] = _mm256_sub_ps(vec_lut[5], vec_b2);\n\
+        vec_lut[4] = vec_b1;\n\
+        vec_lut[4] = _mm256_add_ps(vec_lut[4], vec_b2);\n\
+        vec_lut[3] = vec_b1;\n\
+        vec_lut[2] = vec_b1;\n\
+        vec_lut[2] = _mm256_sub_ps(vec_lut[2], vec_b2);\n\
+        vec_lut[1] = vec_b2;\n\
+        vec_lut[0] = _mm256_setzero_ps();\n\
+\n\
+#pragma unroll\n\
+        for (int g = 0; g < 14; ++g) {\n\
+            vec_lut[g] = _mm256_mul_ps(vec_lut[g], _mm256_set1_ps(t_scales));\n\
+        }\n\
+        __m256i ix[16];\n\
+        for (int g = 0; g < 14; ++g) {\n\
+            ix[g] = _mm256_cvtps_epi32(_mm256_round_ps(vec_lut[g], _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC));\n\
+        }\n\
+        __m256i shuffle_mask = _mm256_set_epi8(\n\
+                                               0x0f, 0x0e, 0x0d, 0x0c, 0x07, 0x06, 0x05, 0x04,\n\
+                                               0x0b, 0x0a, 0x09, 0x08, 0x03, 0x02, 0x01, 0x00,\n\
+                                               0x0f, 0x0e, 0x0d, 0x0c, 0x07, 0x06, 0x05, 0x04,\n\
+                                               0x0b, 0x0a, 0x09, 0x08, 0x03, 0x02, 0x01, 0x00\n\
+                                               );\n\
+        Transpose_8_8(&(ix[0]), &(ix[1]), &(ix[2]), &(ix[3]), &(ix[4]), &(ix[5]),&(ix[6]), &(ix[7]));\n\
+        Transpose_8_8(&(ix[8]), &(ix[9]), &(ix[10]), &(ix[11]), &(ix[12]), &(ix[13]),&(ix[14]), &(ix[15]));\n\
+        int8_t* qlut_i8 = reinterpret_cast<int8_t*>(qlut);\n\
+#pragma unroll\n\
+        for (int g = 0; g < 8; ++g) {\n\
+            ix[g] = _mm256_packs_epi32(ix[g], ix[g + 8]);\n\
+            ix[g] = _mm256_packs_epi16(ix[g], ix[g]);\n\
+            ix[g] = _mm256_permute4x64_epi64(ix[g], _MM_SHUFFLE(3, 1, 2, 0));\n\
+            ix[g] = _mm256_shuffle_epi8(ix[g], shuffle_mask);\n\
+            _mm_storeu_si128(reinterpret_cast<__m128i*>(qlut_i8 + k * 128 + g * 16 + 0), _mm256_castsi256_si128(ix[g]));\n\
+        }\n\
+    }\n\
+\n\
+    *lut_scales = scales;\n\
+#elif defined __ARM_NEON\n\
+    float16x8_t vec_lut[16];\n\
+    float16_t scales = *lut_scales;\n\
+    float16_t t_scales = scales ? 1.0 / scales : 0.0;\n\
+#pragma unroll\n\
+    for (int k = 0; k < act_k / 24; ++k) {\n\
+        float16x8x3_t vec_bs = vld3q_f16(b + k * 24);\n\
+        vec_lut[15] = vdupq_n_f16(0);\n\
+        vec_lut[14] = vdupq_n_f16(0);\n\
+        vec_lut[13] = vec_bs.val[0] + vec_bs.val[1] + vec_bs.val[2];\n\
+        vec_lut[12] = vec_bs.val[0] + vec_bs.val[1];\n\
+        vec_lut[11] = vec_bs.val[0] + vec_bs.val[1] - vec_bs.val[2];\n\
+        vec_lut[10] = vec_bs.val[0] + vec_bs.val[2];\n\
+        vec_lut[9] = vec_bs.val[0];\n\
+        vec_lut[8] = vec_bs.val[0] - vec_bs.val[2];\n\
+        vec_lut[7] = vec_bs.val[0] - vec_bs.val[1] + vec_bs.val[2];\n\
+        vec_lut[6] = vec_bs.val[0] - vec_bs.val[1];\n\
+        vec_lut[5] = vec_bs.val[0] - vec_bs.val[1] - vec_bs.val[2];\n\
+        vec_lut[4] = vec_bs.val[1] + vec_bs.val[2];\n\
+        vec_lut[3] = vec_bs.val[1];\n\
+        vec_lut[2] = vec_bs.val[1] - vec_bs.val[2];\n\
+        vec_lut[1] = vec_bs.val[2];\n\
+        vec_lut[0] = vdupq_n_f16(0);\n\
+\n\
+#pragma unroll\n\
+        for (int g = 0; g < 14; ++g) {\n\
+            vec_lut[g] = vmulq_n_f16(vec_lut[g], t_scales);\n\
+        }\n\
+\n\
+        int8x8_t vec_qlut[16];\n\
+#pragma unroll\n\
+        for (int g = 0; g < 14; ++g) {\n\
+            vec_qlut[g] = vqmovn_s16(vcvtnq_s16_f16(vec_lut[g]));\n\
+        }\n\
+        Transpose_8_8(&(vec_qlut[0]), &(vec_qlut[1]), &(vec_qlut[2]), &(vec_qlut[3]),\n\
+                      &(vec_qlut[4]), &(vec_qlut[5]), &(vec_qlut[6]), &(vec_qlut[7]));\n\
+        Transpose_8_8(&(vec_qlut[8]), &(vec_qlut[9]), &(vec_qlut[10]), &(vec_qlut[11]),\n\
+                      &(vec_qlut[12]), &(vec_qlut[13]), &(vec_qlut[14]), &(vec_qlut[15]));\n\
+\n\
+#pragma unroll\n\
+        for (int idx = 0; idx < 8; idx++) {\n\
+            vst1_s8(qlut + k * 16 * 8 + idx * 16 + 0 * 8, vec_qlut[idx]);\n\
+            vst1_s8(qlut + k * 16 * 8 + idx * 16 + 1 * 8, vec_qlut[idx + 8]);\n\
+        }\n\
+    }\n\
+#endif\n\
+    return 0;\n\
+}\n\
+\n\
+template <int act_k>\n\
+inline int32_t two_lut_ctor(int8_t* qlut, bitnet_float_type* b, bitnet_float_type* lut_scales) {\n\
+#if defined __AVX2__\n\
+    __m256 vec_lut[16];\n\
+    const __m256i vec_bi = _mm256_set_epi32(56, 48, 40, 32, 24, 16, 8, 0);\n\
+    bitnet_float_type scales = *lut_scales;\n\
+    bitnet_float_type t_scales = scales ? 1.0f / scales : 0.0f;\n\
+#pragma unroll\n\
+    for (int k = 0; k < act_k / 16; ++k) {\n\
+        __m256 vec_b0 = _mm256_i32gather_ps(b + k * 16 + 0, vec_bi, 1);\n\
+        __m256 vec_b1 = _mm256_i32gather_ps(b + k * 16 + 1, vec_bi, 1);\n\
+        vec_lut[0] = _mm256_setzero_ps();\n\
+        vec_lut[0] = _mm256_sub_ps(vec_lut[0], vec_b0);\n\
+        vec_lut[0] = _mm256_sub_ps(vec_lut[0], vec_b1);\n\
+        vec_lut[1] = _mm256_setzero_ps();\n\
+        vec_lut[1] = _mm256_sub_ps(vec_lut[1], vec_b0);\n\
+        vec_lut[2] = _mm256_setzero_ps();\n\
+        vec_lut[2] = _mm256_sub_ps(vec_lut[2], vec_b0);\n\
+        vec_lut[2] = _mm256_add_ps(vec_lut[2], vec_b1);\n\
+        vec_lut[3] = _mm256_setzero_ps();\n\
+        vec_lut[3] = _mm256_sub_ps(vec_lut[3], vec_b1);\n\
+        vec_lut[4] = _mm256_setzero_ps();\n\
+        vec_lut[5] = vec_b1;\n\
+        vec_lut[6] = vec_b0;\n\
+        vec_lut[6] = _mm256_sub_ps(vec_lut[6], vec_b1);\n\
+        vec_lut[7] = vec_b0;\n\
+        vec_lut[8] = vec_b0;\n\
+        vec_lut[8] = _mm256_add_ps(vec_lut[8], vec_b1);\n\
+        vec_lut[9] = _mm256_setzero_ps();\n\
+        vec_lut[10] = _mm256_setzero_ps();\n\
+        vec_lut[11] = _mm256_setzero_ps();\n\
+        vec_lut[12] = _mm256_setzero_ps();\n\
+        vec_lut[13] = _mm256_setzero_ps();\n\
+        vec_lut[14] = _mm256_setzero_ps();\n\
+        vec_lut[15] = _mm256_setzero_ps();\n\
+\n\
+#pragma unroll\n\
+        for (int g = 0; g < 9; ++g) {\n\
+            vec_lut[g] = _mm256_mul_ps(vec_lut[g], _mm256_set1_ps(t_scales));\n\
+        }\n\
+        __m256i ix[16];\n\
+#pragma unroll\n\
+        for (int g = 0; g < 16; ++g) {\n\
+            ix[g] = _mm256_cvtps_epi32(_mm256_round_ps(vec_lut[g], _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC));\n\
+        }\n\
+\n\
+        __m256i shuffle_mask = _mm256_set_epi8(\n\
+                                               0x0f, 0x0e, 0x0d, 0x0c, 0x07, 0x06, 0x05, 0x04,\n\
+                                               0x0b, 0x0a, 0x09, 0x08, 0x03, 0x02, 0x01, 0x00,\n\
+                                               0x0f, 0x0e, 0x0d, 0x0c, 0x07, 0x06, 0x05, 0x04,\n\
+                                               0x0b, 0x0a, 0x09, 0x08, 0x03, 0x02, 0x01, 0x00\n\
+                                               );\n\
+\n\
+        Transpose_8_8(&(ix[0]), &(ix[1]), &(ix[2]), &(ix[3]), &(ix[4]), &(ix[5]),&(ix[6]), &(ix[7]));\n\
+        Transpose_8_8(&(ix[8]), &(ix[9]), &(ix[10]), &(ix[11]), &(ix[12]), &(ix[13]),&(ix[14]), &(ix[15]));\n\
+\n\
+        int8_t* qlut_i8 = reinterpret_cast<int8_t*>(qlut);\n\
+#pragma unroll\n\
+        for (int g = 0; g < 8; ++g) {\n\
+            ix[g] = _mm256_packs_epi32(ix[g], ix[g + 8]);\n\
+            ix[g] = _mm256_packs_epi16(ix[g], ix[g]);\n\
+            ix[g] = _mm256_permute4x64_epi64(ix[g], _MM_SHUFFLE(3, 1, 2, 0));\n\
+            ix[g] = _mm256_shuffle_epi8(ix[g], shuffle_mask);\n\
+            _mm_storeu_si128(reinterpret_cast<__m128i*>(qlut_i8 + k * 128 + g * 16 + 0), _mm256_castsi256_si128(ix[g]));\n\
+        }\n\
+    }\n\
+    *lut_scales = scales;\n\
+#elif defined __ARM_NEON\n\
+    float16x8_t vec_lut[16];\n\
+    float16_t scales = *lut_scales;\n\
+    float16_t t_scales = scales ? 1.0 / scales : 0.0;\n\
+\n\
+#pragma unroll\n\
+    for (int k = 0; k < act_k / 16; ++k) {\n\
+        float16x8x2_t vec_bs = vld2q_f16(b + k * 16);\n\
+        vec_lut[15] = vdupq_n_f16(0);\n\
+        vec_lut[14] = vdupq_n_f16(0);\n\
+        vec_lut[13] = vdupq_n_f16(0);\n\
+        vec_lut[12] = vdupq_n_f16(0);\n\
+        vec_lut[11] = vdupq_n_f16(0);\n\
+        vec_lut[10] = vdupq_n_f16(0);\n\
+        vec_lut[9] = vdupq_n_f16(0);\n\
+        vec_lut[8] = vec_bs.val[0] + vec_bs.val[1];\n\
+        vec_lut[7] = vec_bs.val[0];\n\
+        vec_lut[6] = vec_bs.val[0] - vec_bs.val[1];\n\
+        vec_lut[5] = vec_bs.val[1];\n\
+        vec_lut[4] = vdupq_n_f16(0);\n\
+        vec_lut[3] = -vec_bs.val[1];\n\
+        vec_lut[2] = -vec_bs.val[0] + vec_bs.val[1];\n\
+        vec_lut[1] = -vec_bs.val[0];\n\
+        vec_lut[0] = -vec_bs.val[0] - vec_bs.val[1];\n\
+\n\
+#pragma unroll\n\
+        for (int g = 0; g < 16; ++g) {\n\
+            vec_lut[g] = vmulq_n_f16(vec_lut[g], t_scales);\n\
+        }\n\
+\n\
+        int8x8_t vec_qlut[16];\n\
+#pragma unroll\n\
+        for (int g = 0; g < 16; ++g) {\n\
+            vec_qlut[g] = vqmovn_s16(vcvtnq_s16_f16(vec_lut[g]));\n\
+        }\n\
+        Transpose_8_8(&(vec_qlut[0]), &(vec_qlut[1]), &(vec_qlut[2]), &(vec_qlut[3]),\n\
+                      &(vec_qlut[4]), &(vec_qlut[5]), &(vec_qlut[6]), &(vec_qlut[7]));\n\
+        Transpose_8_8(&(vec_qlut[8]), &(vec_qlut[9]), &(vec_qlut[10]), &(vec_qlut[11]),\n\
+                      &(vec_qlut[12]), &(vec_qlut[13]), &(vec_qlut[14]), &(vec_qlut[15]));\n\
+\n\
+#pragma unroll\n\
+        for (int idx = 0; idx < 8; idx++) {\n\
+            vst1_s8(qlut + k * 16 * 8 + idx * 16 + 0 * 8, vec_qlut[idx]);\n\
+            vst1_s8(qlut + k * 16 * 8 + idx * 16 + 1 * 8, vec_qlut[idx + 8]);\n\
+        }\n\
+    }\n\
+#endif\n\
+    return 0;\n\
+}\n\
+static bool is_type_supported(enum ggml_type type) {\n\
+    if (type == GGML_TYPE_Q4_0 ||\n\
+        type == GGML_TYPE_TL2) {\n\
+        return true;\n\
+    } else {\n\
+        return false;\n\
+    }\n\
+}\n\
+"
+    return kernel_code
+
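+# Illustrative reference only (kept out of the generated header): the ctor
+# strings above lay out their lookup tables so that, for ternary weights in
+# {-1, 0, 1}, lut3[9a + 3b + c] holds a*b0 + b*b1 + c*b2 for the non-negative
+# indices (negative combinations are recovered from the packed sign bits), and
+# lut2[3*(w0 + 1) + (w1 + 1)] holds w0*b0 + w1*b1 with entries 9..15 left zero.
+def three_lut_reference(b0, b1, b2):
+    lut = [0.0] * 16
+    for a in (-1, 0, 1):
+        for b in (-1, 0, 1):
+            for c in (-1, 0, 1):
+                v = 9 * a + 3 * b + c
+                if v >= 0:
+                    # indices 0..13; entries 14 and 15 stay zero, as in the ctor
+                    lut[v] = a * b0 + b * b1 + c * b2
+    return lut
+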
+def gen_tbl_impl(pre, BM, BK, bm, k_list):
+
+    kernel_code = "\
+\n\
+#define BM{0} {1}\n\
+#define BBK{0} {2}\n\
+template <int BATCH_SIZE>\n\
+inline void three_tbl_impl_{0}(int32_t* c, int8_t* lut, uint8_t* a, uint8_t* sign) {{\n\
+".format(pre, BM, BK)
+
+    if bm == 16:
+        kernel_code = "".join([kernel_code, "\
+#ifdef __AVX2__\n\
+    const int KK = BBK{0} / 3;\n\
+    for (int i = 0; i < BM{0}; i += 16) {{\n\
+        __m256i vec_c0 = _mm256_setzero_si256();\n\
+#pragma unroll\n\
+        for (int k = 0; k < KK / 16; k++) {{\n\
+            __m256i vec_sign = _mm256_loadu_si256(reinterpret_cast<__m256i*>(sign + i * KK / 8 + k * 32));\n\
+            __m256i vec_k_top_256_0 = _mm256_loadu_si256(reinterpret_cast<__m256i*>(lut + 256 * k + 0 * 64));\n\
+            __m256i vec_k_bot_256_0 = _mm256_loadu_si256(reinterpret_cast<__m256i*>(lut + 256 * k + 0 * 64 + 32));\n\
+            __m256i vec_a_0 = _mm256_loadu_si256(reinterpret_cast<__m256i*>(a + i * KK / 2 + k * 128 + 0 * 32));\n\
+            __m256i vec_a_top_0 = _mm256_and_si256(_mm256_srli_epi16(vec_a_0, 4), _mm256_set1_epi8(0x0f));\n\
+            __m256i vec_a_bot_0 = _mm256_and_si256(vec_a_0, _mm256_set1_epi8(0x0f));\n\
+            __m256i vec_sign_top_0 = _mm256_sub_epi8(_mm256_and_si256(_mm256_srli_epi16(vec_sign, 7 - 2 * 0), _mm256_set1_epi8(0x01)), _mm256_set1_epi8(0x01));\n\
+            __m256i vec_v_top_0 = _mm256_xor_si256(_mm256_add_epi8(_mm256_shuffle_epi8(vec_k_top_256_0, vec_a_top_0), vec_sign_top_0), vec_sign_top_0);\n\
+            __m256i vec_sign_bot_0 = _mm256_sub_epi8(_mm256_and_si256(_mm256_srli_epi16(vec_sign, 6 - 2 * 0), _mm256_set1_epi8(0x01)), _mm256_set1_epi8(0x01));\n\
+            __m256i vec_v_bot_0 = _mm256_xor_si256(_mm256_add_epi8(_mm256_shuffle_epi8(vec_k_bot_256_0, vec_a_bot_0), vec_sign_bot_0), vec_sign_bot_0);\n\
+            vec_c0 = _mm256_add_epi16(vec_c0, _mm256_cvtepi8_epi16(_mm256_castsi256_si128(vec_v_bot_0)));\n\
+            vec_c0 = _mm256_add_epi16(vec_c0, _mm256_cvtepi8_epi16(_mm256_castsi256_si128(vec_v_top_0)));\n\
+            vec_c0 = _mm256_add_epi16(vec_c0, _mm256_cvtepi8_epi16(_mm256_extracti128_si256(vec_v_bot_0, 1)));\n\
+            vec_c0 = _mm256_add_epi16(vec_c0, _mm256_cvtepi8_epi16(_mm256_extracti128_si256(vec_v_top_0, 1)));\n\
+            __m256i vec_k_top_256_1 = _mm256_loadu_si256(reinterpret_cast<__m256i*>(lut + 256 * k + 1 * 64));\n\
+            __m256i vec_k_bot_256_1 = _mm256_loadu_si256(reinterpret_cast<__m256i*>(lut + 256 * k + 1 * 64 + 32));\n\
+            __m256i vec_a_1 = _mm256_loadu_si256(reinterpret_cast<__m256i*>(a + i * KK / 2 + k * 128 + 1 * 32));\n\
+            __m256i vec_a_top_1 = _mm256_and_si256(_mm256_srli_epi16(vec_a_1, 4), _mm256_set1_epi8(0x0f));\n\
+            __m256i vec_a_bot_1 = _mm256_and_si256(vec_a_1, _mm256_set1_epi8(0x0f));\n\
+            __m256i vec_sign_top_1 = _mm256_sub_epi8(_mm256_and_si256(_mm256_srli_epi16(vec_sign, 7 - 2 * 1), _mm256_set1_epi8(0x01)), _mm256_set1_epi8(0x01));\n\
+            __m256i vec_v_top_1 = _mm256_xor_si256(_mm256_add_epi8(_mm256_shuffle_epi8(vec_k_top_256_1, vec_a_top_1), vec_sign_top_1), vec_sign_top_1);\n\
+            __m256i vec_sign_bot_1 = _mm256_sub_epi8(_mm256_and_si256(_mm256_srli_epi16(vec_sign, 6 - 2 * 1), _mm256_set1_epi8(0x01)), _mm256_set1_epi8(0x01));\n\
+            __m256i vec_v_bot_1 = _mm256_xor_si256(_mm256_add_epi8(_mm256_shuffle_epi8(vec_k_bot_256_1, vec_a_bot_1), vec_sign_bot_1), vec_sign_bot_1);\n\
+            vec_c0 = _mm256_add_epi16(vec_c0, _mm256_cvtepi8_epi16(_mm256_castsi256_si128(vec_v_bot_1)));\n\
+            vec_c0 = _mm256_add_epi16(vec_c0, _mm256_cvtepi8_epi16(_mm256_castsi256_si128(vec_v_top_1)));\n\
+            vec_c0 = _mm256_add_epi16(vec_c0, _mm256_cvtepi8_epi16(_mm256_extracti128_si256(vec_v_bot_1, 1)));\n\
+            vec_c0 = _mm256_add_epi16(vec_c0, _mm256_cvtepi8_epi16(_mm256_extracti128_si256(vec_v_top_1, 1)));\n\
+            __m256i vec_k_top_256_2 = _mm256_loadu_si256(reinterpret_cast<__m256i*>(lut + 256 * k + 2 * 64));\n\
+            __m256i vec_k_bot_256_2 = _mm256_loadu_si256(reinterpret_cast<__m256i*>(lut + 256 * k + 2 * 64 + 32));\n\
+            __m256i vec_a_2 = _mm256_loadu_si256(reinterpret_cast<__m256i*>(a + i * KK / 2 + k * 128 + 2 * 32));\n\
+            __m256i vec_a_top_2 = _mm256_and_si256(_mm256_srli_epi16(vec_a_2, 4), _mm256_set1_epi8(0x0f));\n\
+            __m256i vec_a_bot_2 = _mm256_and_si256(vec_a_2, _mm256_set1_epi8(0x0f));\n\
+            __m256i vec_sign_top_2 = _mm256_sub_epi8(_mm256_and_si256(_mm256_srli_epi16(vec_sign, 7 - 2 * 2), _mm256_set1_epi8(0x01)), _mm256_set1_epi8(0x01));\n\
+            __m256i vec_v_top_2 = _mm256_xor_si256(_mm256_add_epi8(_mm256_shuffle_epi8(vec_k_top_256_2, vec_a_top_2), vec_sign_top_2), vec_sign_top_2);\n\
+            __m256i vec_sign_bot_2 = _mm256_sub_epi8(_mm256_and_si256(_mm256_srli_epi16(vec_sign, 6 - 2 * 2), _mm256_set1_epi8(0x01)), _mm256_set1_epi8(0x01));\n\
+            __m256i vec_v_bot_2 = _mm256_xor_si256(_mm256_add_epi8(_mm256_shuffle_epi8(vec_k_bot_256_2, vec_a_bot_2), vec_sign_bot_2), vec_sign_bot_2);\n\
+            vec_c0 = _mm256_add_epi16(vec_c0, _mm256_cvtepi8_epi16(_mm256_castsi256_si128(vec_v_bot_2)));\n\
+            vec_c0 = _mm256_add_epi16(vec_c0, _mm256_cvtepi8_epi16(_mm256_castsi256_si128(vec_v_top_2)));\n\
+            vec_c0 = _mm256_add_epi16(vec_c0, _mm256_cvtepi8_epi16(_mm256_extracti128_si256(vec_v_bot_2, 1)));\n\
+            vec_c0 = _mm256_add_epi16(vec_c0, _mm256_cvtepi8_epi16(_mm256_extracti128_si256(vec_v_top_2, 1)));\n\
+            __m256i vec_k_top_256_3 = _mm256_loadu_si256(reinterpret_cast<__m256i*>(lut + 256 * k + 3 * 64));\n\
+            __m256i vec_k_bot_256_3 = _mm256_loadu_si256(reinterpret_cast<__m256i*>(lut + 256 * k + 3 * 64 + 32));\n\
+            __m256i vec_a_3 = _mm256_loadu_si256(reinterpret_cast<__m256i*>(a + i * KK / 2 + k * 128 + 3 * 32));\n\
+            __m256i vec_a_top_3 = _mm256_and_si256(_mm256_srli_epi16(vec_a_3, 4), _mm256_set1_epi8(0x0f));\n\
+            __m256i vec_a_bot_3 = _mm256_and_si256(vec_a_3, _mm256_set1_epi8(0x0f));\n\
+            __m256i vec_sign_top_3 = _mm256_sub_epi8(_mm256_and_si256(_mm256_srli_epi16(vec_sign, 7 - 3 * 2), _mm256_set1_epi8(0x01)), _mm256_set1_epi8(0x01));\n\
+            __m256i vec_v_top_3 = _mm256_xor_si256(_mm256_add_epi8(_mm256_shuffle_epi8(vec_k_top_256_3, vec_a_top_3), vec_sign_top_3), vec_sign_top_3);\n\
+            __m256i vec_sign_bot_3 = _mm256_sub_epi8(_mm256_and_si256(_mm256_srli_epi16(vec_sign, 6 - 3 * 2), _mm256_set1_epi8(0x01)), _mm256_set1_epi8(0x01));\n\
+            __m256i vec_v_bot_3 = _mm256_xor_si256(_mm256_add_epi8(_mm256_shuffle_epi8(vec_k_bot_256_3, vec_a_bot_3), vec_sign_bot_3), vec_sign_bot_3);\n\
+            vec_c0 = _mm256_add_epi16(vec_c0, _mm256_cvtepi8_epi16(_mm256_castsi256_si128(vec_v_bot_3)));\n\
+            vec_c0 = _mm256_add_epi16(vec_c0, _mm256_cvtepi8_epi16(_mm256_castsi256_si128(vec_v_top_3)));\n\
+            vec_c0 = _mm256_add_epi16(vec_c0, _mm256_cvtepi8_epi16(_mm256_extracti128_si256(vec_v_bot_3, 1)));\n\
+            vec_c0 = _mm256_add_epi16(vec_c0, _mm256_cvtepi8_epi16(_mm256_extracti128_si256(vec_v_top_3, 1)));\n\
+        }}\n\
+        __m256i vec_gc0 = _mm256_loadu_si256(reinterpret_cast<__m256i*>(c + i));\n\
+        __m256i vec_gc1 = _mm256_loadu_si256(reinterpret_cast<__m256i*>(c + i + 8));\n\
+        vec_gc0 = _mm256_add_epi32(vec_gc0, _mm256_cvtepi16_epi32(_mm256_castsi256_si128(vec_c0)));\n\
+        vec_gc1 = _mm256_add_epi32(vec_gc1, _mm256_cvtepi16_epi32(_mm256_extracti128_si256(vec_c0, 1)));\n\
+        _mm256_storeu_si256(reinterpret_cast<__m256i*>(c + i), vec_gc0);\n\
+        _mm256_storeu_si256(reinterpret_cast<__m256i*>(c + i + 8), vec_gc1);\n\
+    }}\n".format(pre)])
+
+        kernel_code = "".join([kernel_code, "\
+#elif defined __ARM_NEON\n\
+    const int KK = BBK{0} / 3;\n\
+    const uint8x16_t vec_mask = vdupq_n_u8(0x0f);\n\
+#pragma unroll\n\
+    for (int i = 0; i < BM{0}; i += 16) {{\n\
+        int16x8_t vec_c0 = vdupq_n_s16(0);\n\
+        int16x8_t vec_c1 = vdupq_n_s16(0);\n\
+#pragma unroll \n\
+        for (int k = 0; k < KK / 16; k++) {{\n\
+            uint8x16_t vec_sign_left = vmvnq_u8(vld1q_u8(sign + i * KK / 8 + k * 32));\n\
+            uint8x16_t vec_sign_right = vmvnq_u8(vld1q_u8(sign + i * KK / 8 + k * 32 + 16));\n".format(pre)])
+
+        for i in range(4):
+            kernel_code = "".join([kernel_code, "\
+            int8x16_t vec_k_left_left_{0} = vld1q_s8(lut + 256 * k + {0} * 64);\n\
+            int8x16_t vec_k_left_right_{0} = vld1q_s8(lut + 256 * k + {0} * 64 + 16);\n\
+            int8x16_t vec_k_right_left_{0} = vld1q_s8(lut + 256 * k + {0} * 64 + 32);\n\
+            int8x16_t vec_k_right_right_{0} = vld1q_s8(lut + 256 * k + {0} * 64 + 48);\n\
+            uint8x16_t vec_sign_left_left_{0} = vcltzq_s8(vshlq_n_u8(vec_sign_left, 2 * {0}));\n\
+            uint8x16_t vec_sign_left_right_{0} = vcltzq_s8(vshlq_n_u8(vec_sign_left, 2 * {0} + 1));\n\
+            uint8x16_t vec_sign_right_left_{0} = vcltzq_s8(vshlq_n_u8(vec_sign_right, 2 * {0}));\n\
+            uint8x16_t vec_sign_right_right_{0} = vcltzq_s8(vshlq_n_u8(vec_sign_right, 2 * {0} + 1));\n\
+            uint8x16_t vec_a_left_{0} = vld1q_u8(a + i * KK / 2 + k * 128 + {0} * 32);\n\
+            uint8x16_t vec_a_right_{0} = vld1q_u8(a + i * KK / 2 + k * 128 + {0} * 32 + 16);\n\
+            uint8x16_t vec_a_left_left_{0} = vshrq_n_u8(vec_a_left_{0}, 4);\n\
+            uint8x16_t vec_a_left_right_{0} = vandq_u8(vec_a_left_{0}, vec_mask);\n\
+            uint8x16_t vec_a_right_left_{0} = vshrq_n_u8(vec_a_right_{0}, 4);\n\
+            uint8x16_t vec_a_right_right_{0} = vandq_u8(vec_a_right_{0}, vec_mask);\n\
+            int8x16_t vec_v_top_left_tmp_{0} = vqtbl1q_s8(vec_k_left_left_{0}, vec_a_left_left_{0});\n\
+            int8x16_t vec_v_bot_left_tmp_{0} = vqtbl1q_s8(vec_k_left_right_{0}, vec_a_right_left_{0});\n\
+            int8x16_t vec_v_top_right_tmp_{0} = vqtbl1q_s8(vec_k_right_left_{0}, vec_a_left_right_{0});\n\
+            int8x16_t vec_v_bot_right_tmp_{0} = vqtbl1q_s8(vec_k_right_right_{0}, vec_a_right_right_{0});\n\
+            vec_v_top_left_tmp_{0} = vbslq_s8(vec_sign_left_left_{0}, vnegq_s8(vec_v_top_left_tmp_{0}), vec_v_top_left_tmp_{0});\n\
+            vec_v_bot_left_tmp_{0} = vbslq_s8(vec_sign_right_left_{0}, vnegq_s8(vec_v_bot_left_tmp_{0}), vec_v_bot_left_tmp_{0});\n\
+            vec_v_top_right_tmp_{0} = vbslq_s8(vec_sign_left_right_{0}, vnegq_s8(vec_v_top_right_tmp_{0}), vec_v_top_right_tmp_{0});\n\
+            vec_v_bot_right_tmp_{0} = vbslq_s8(vec_sign_right_right_{0}, vnegq_s8(vec_v_bot_right_tmp_{0}), vec_v_bot_right_tmp_{0});\n\
+            int16x8_t vec_v_top_left_high_{0} = vmovl_high_s8(vec_v_top_left_tmp_{0});\n\
+            int16x8_t vec_v_top_left_bot_{0} = vmovl_s8(vget_low_s8(vec_v_top_left_tmp_{0}));\n\
+            int16x8_t vec_v_top_right_high_{0} = vmovl_high_s8(vec_v_top_right_tmp_{0});\n\
+            int16x8_t vec_v_top_right_bot_{0} = vmovl_s8(vget_low_s8(vec_v_top_right_tmp_{0}));\n\
+            int16x8_t vec_v_bot_left_high_{0} = vmovl_high_s8(vec_v_bot_left_tmp_{0});\n\
+            int16x8_t vec_v_bot_left_bot_{0} = vmovl_s8(vget_low_s8(vec_v_bot_left_tmp_{0}));\n\
+            int16x8_t vec_v_bot_right_high_{0} = vmovl_high_s8(vec_v_bot_right_tmp_{0});\n\
+            int16x8_t vec_v_bot_right_bot_{0} = vmovl_s8(vget_low_s8(vec_v_bot_right_tmp_{0}));\n\
+            vec_c0 += vec_v_top_left_bot_{0};\n\
+            vec_c0 += vec_v_top_right_bot_{0};\n\
+            vec_c0 += vec_v_bot_left_bot_{0};\n\
+            vec_c0 += vec_v_bot_right_bot_{0};\n\
+            vec_c1 += vec_v_top_left_high_{0};\n\
+            vec_c1 += vec_v_top_right_high_{0};\n\
+            vec_c1 += vec_v_bot_left_high_{0};\n\
+            vec_c1 += vec_v_bot_right_high_{0};\n".format(i)])
+
+        kernel_code = "".join([kernel_code, "\
+        }\n\
+        int32x4_t vec_v_1 = vmovl_high_s16(vec_c0);\n\
+        int32x4_t vec_v_0 = vmovl_s16(vget_low_s16(vec_c0));\n\
+        int32x4_t vec_v_3 = vmovl_high_s16(vec_c1);\n\
+        int32x4_t vec_v_2 = vmovl_s16(vget_low_s16(vec_c1));\n\
+        vst1q_s32(c + i,      vld1q_s32(c + i     ) + vec_v_0);\n\
+        vst1q_s32(c + i + 4,  vld1q_s32(c + i + 4 ) + vec_v_1);\n\
+        vst1q_s32(c + i + 8,  vld1q_s32(c + i + 8 ) + vec_v_2);\n\
+        vst1q_s32(c + i + 12, vld1q_s32(c + i + 12) + vec_v_3);\n\
+    }\n\
+#endif\n\
+}\n"])
+    elif bm == 32:
+        kernel_code = "".join([kernel_code, "\
+#ifdef __AVX2__\n\
+    const int KK = BBK{0} / 3;\n\
+    for (int i = 0; i < BM{0}; i += 32) {{\n\
+        __m256i vec_c0 = _mm256_set1_epi16(0);\n\
+        __m256i vec_c1 = _mm256_set1_epi16(0);\n\
+#pragma unroll\n\
+        for (int k = 0; k < KK / 8; k++) {{\n\
+            __m256i vec_sign = _mm256_loadu_si256(reinterpret_cast<__m256i*>(sign + i * KK / 8 + k * 32));\n\
+            __m128i vec_k_top_0 = _mm_loadu_si128(reinterpret_cast<__m128i*>(lut + 128 * k + 0 * 32 + 0));\n\
+            __m256i vec_k_top_256_0 = _mm256_set_m128i(vec_k_top_0, vec_k_top_0);\n\
+            __m128i vec_k_bot_0 = _mm_loadu_si128(reinterpret_cast<__m128i*>(lut + 128 * k + 0 * 32 + 16));\n\
+            __m256i vec_k_bot_256_0 = _mm256_set_m128i(vec_k_bot_0, vec_k_bot_0);\n\
+            __m256i vec_a_0 = _mm256_loadu_si256(reinterpret_cast<__m256i*>(a + i * KK / 2 + k * 32 * 4 + 0 * 32));\n\
+            __m256i vec_a_top_0 = _mm256_and_si256(_mm256_srli_epi16(vec_a_0, 4), _mm256_set1_epi8(0x0f));\n\
+            __m256i vec_a_bot_0 = _mm256_and_si256(vec_a_0, _mm256_set1_epi8(0x0f));\n\
+            __m256i vec_sign_top_0 = _mm256_sub_epi8(_mm256_and_si256(_mm256_srli_epi16(vec_sign, 7 - 0), _mm256_set1_epi8(0x01)), _mm256_set1_epi8(0x01));\n\
+            __m256i vec_v_top_0 = _mm256_xor_si256(_mm256_add_epi8(_mm256_shuffle_epi8(vec_k_top_256_0, vec_a_top_0), vec_sign_top_0), vec_sign_top_0);\n\
+            __m256i vec_sign_bot_0 = _mm256_sub_epi8(_mm256_and_si256(_mm256_srli_epi16(vec_sign, 3 - 0), _mm256_set1_epi8(0x01)), _mm256_set1_epi8(0x01));\n\
+            __m256i vec_v_bot_0 = _mm256_xor_si256(_mm256_add_epi8(_mm256_shuffle_epi8(vec_k_bot_256_0, vec_a_bot_0), vec_sign_bot_0), vec_sign_bot_0);\n\
+            vec_c0 = _mm256_add_epi16(vec_c0, _mm256_cvtepi8_epi16(_mm256_castsi256_si128(vec_v_bot_0)));\n\
+            vec_c0 = _mm256_add_epi16(vec_c0, _mm256_cvtepi8_epi16(_mm256_castsi256_si128(vec_v_top_0)));\n\
+            vec_c1 = _mm256_add_epi16(vec_c1, _mm256_cvtepi8_epi16(_mm256_extracti128_si256(vec_v_bot_0, 1)));\n\
+            vec_c1 = _mm256_add_epi16(vec_c1, _mm256_cvtepi8_epi16(_mm256_extracti128_si256(vec_v_top_0, 1)));\n\
+            __m128i vec_k_top_1 = _mm_loadu_si128(reinterpret_cast<__m128i*>(lut + 128 * k + 1 * 32 + 0));\n\
+            __m256i vec_k_top_256_1 = _mm256_set_m128i(vec_k_top_1, vec_k_top_1);\n\
+            __m128i vec_k_bot_1 = _mm_loadu_si128(reinterpret_cast<__m128i*>(lut + 128 * k + 1 * 32 + 16));\n\
+            __m256i vec_k_bot_256_1 = _mm256_set_m128i(vec_k_bot_1, vec_k_bot_1);\n\
+            __m256i vec_a_1 = _mm256_loadu_si256(reinterpret_cast<__m256i*>(a + i * KK / 2 + k * 32 * 4 + 1 * 32));\n\
+            __m256i vec_a_top_1 = _mm256_and_si256(_mm256_srli_epi16(vec_a_1, 4), _mm256_set1_epi8(0x0f));\n\
+            __m256i vec_a_bot_1 = _mm256_and_si256(vec_a_1, _mm256_set1_epi8(0x0f));\n\
+            __m256i vec_sign_top_1 = _mm256_sub_epi8(_mm256_and_si256(_mm256_srli_epi16(vec_sign, 7 - 1), _mm256_set1_epi8(0x01)), _mm256_set1_epi8(0x01));\n\
+            __m256i vec_v_top_1 = _mm256_xor_si256(_mm256_add_epi8(_mm256_shuffle_epi8(vec_k_top_256_1, vec_a_top_1), vec_sign_top_1), vec_sign_top_1);\n\
+            __m256i vec_sign_bot_1 = _mm256_sub_epi8(_mm256_and_si256(_mm256_srli_epi16(vec_sign, 3 - 1), _mm256_set1_epi8(0x01)), _mm256_set1_epi8(0x01));\n\
+            __m256i vec_v_bot_1 = _mm256_xor_si256(_mm256_add_epi8(_mm256_shuffle_epi8(vec_k_bot_256_1, vec_a_bot_1), vec_sign_bot_1), vec_sign_bot_1);\n\
+            vec_c0 = _mm256_add_epi16(vec_c0, _mm256_cvtepi8_epi16(_mm256_castsi256_si128(vec_v_bot_1)));\n\
+            vec_c0 = _mm256_add_epi16(vec_c0, _mm256_cvtepi8_epi16(_mm256_castsi256_si128(vec_v_top_1)));\n\
+            vec_c1 = _mm256_add_epi16(vec_c1, _mm256_cvtepi8_epi16(_mm256_extracti128_si256(vec_v_bot_1, 1)));\n\
+            vec_c1 = _mm256_add_epi16(vec_c1, _mm256_cvtepi8_epi16(_mm256_extracti128_si256(vec_v_top_1, 1)));\n\
+            __m128i vec_k_top_2 = _mm_loadu_si128(reinterpret_cast<__m128i*>(lut + 128 * k + 2 * 32 + 0));\n\
+            __m256i vec_k_top_256_2 = _mm256_set_m128i(vec_k_top_2, vec_k_top_2);\n\
+            __m128i vec_k_bot_2 = _mm_loadu_si128(reinterpret_cast<__m128i*>(lut + 128 * k + 2 * 32 + 16));\n\
+            __m256i vec_k_bot_256_2 = _mm256_set_m128i(vec_k_bot_2, vec_k_bot_2);\n\
+            __m256i vec_a_2 = _mm256_loadu_si256(reinterpret_cast<__m256i*>(a + i * KK / 2 + k * 32 * 4 + 2 * 32));\n\
+            __m256i vec_a_top_2 = _mm256_and_si256(_mm256_srli_epi16(vec_a_2, 4), _mm256_set1_epi8(0x0f));\n\
+            __m256i vec_a_bot_2 = _mm256_and_si256(vec_a_2, _mm256_set1_epi8(0x0f));\n\
+            __m256i vec_sign_top_2 = _mm256_sub_epi8(_mm256_and_si256(_mm256_srli_epi16(vec_sign, 7 - 2), _mm256_set1_epi8(0x01)), _mm256_set1_epi8(0x01));\n\
+            __m256i vec_v_top_2 = _mm256_xor_si256(_mm256_add_epi8(_mm256_shuffle_epi8(vec_k_top_256_2, vec_a_top_2), vec_sign_top_2), vec_sign_top_2);\n\
+            __m256i vec_sign_bot_2 = _mm256_sub_epi8(_mm256_and_si256(_mm256_srli_epi16(vec_sign, 3 - 2), _mm256_set1_epi8(0x01)), _mm256_set1_epi8(0x01));\n\
+            __m256i vec_v_bot_2 = _mm256_xor_si256(_mm256_add_epi8(_mm256_shuffle_epi8(vec_k_bot_256_2, vec_a_bot_2), vec_sign_bot_2), vec_sign_bot_2);\n\
+            vec_c0 = _mm256_add_epi16(vec_c0, _mm256_cvtepi8_epi16(_mm256_castsi256_si128(vec_v_bot_2)));\n\
+            vec_c0 = _mm256_add_epi16(vec_c0, _mm256_cvtepi8_epi16(_mm256_castsi256_si128(vec_v_top_2)));\n\
+            vec_c1 = _mm256_add_epi16(vec_c1, _mm256_cvtepi8_epi16(_mm256_extracti128_si256(vec_v_bot_2, 1)));\n\
+            vec_c1 = _mm256_add_epi16(vec_c1, _mm256_cvtepi8_epi16(_mm256_extracti128_si256(vec_v_top_2, 1)));\n\
+            __m128i vec_k_top_3 = _mm_loadu_si128(reinterpret_cast<__m128i*>(lut + 128 * k + 3 * 32 + 0));\n\
+            __m256i vec_k_top_256_3 = _mm256_set_m128i(vec_k_top_3, vec_k_top_3);\n\
+            __m128i vec_k_bot_3 = _mm_loadu_si128(reinterpret_cast<__m128i*>(lut + 128 * k + 3 * 32 + 16));\n\
+            __m256i vec_k_bot_256_3 = _mm256_set_m128i(vec_k_bot_3, vec_k_bot_3);\n\
+            __m256i vec_a_3 = _mm256_loadu_si256(reinterpret_cast<__m256i*>(a + i * KK / 2 + k * 32 * 4 + 3 * 32));\n\
+            __m256i vec_a_top_3 = _mm256_and_si256(_mm256_srli_epi16(vec_a_3, 4), _mm256_set1_epi8(0x0f));\n\
+            __m256i vec_a_bot_3 = _mm256_and_si256(vec_a_3, _mm256_set1_epi8(0x0f));\n\
+            __m256i vec_sign_top_3 = _mm256_sub_epi8(_mm256_and_si256(_mm256_srli_epi16(vec_sign, 7 - 3), _mm256_set1_epi8(0x01)), _mm256_set1_epi8(0x01));\n\
+            __m256i vec_v_top_3 = _mm256_xor_si256(_mm256_add_epi8(_mm256_shuffle_epi8(vec_k_top_256_3, vec_a_top_3), vec_sign_top_3), vec_sign_top_3);\n\
+            __m256i vec_sign_bot_3 = _mm256_sub_epi8(_mm256_and_si256(_mm256_srli_epi16(vec_sign, 3 - 3), _mm256_set1_epi8(0x01)), _mm256_set1_epi8(0x01));\n\
+            __m256i vec_v_bot_3 = _mm256_xor_si256(_mm256_add_epi8(_mm256_shuffle_epi8(vec_k_bot_256_3, vec_a_bot_3), vec_sign_bot_3), vec_sign_bot_3);\n\
+            vec_c0 = _mm256_add_epi16(vec_c0, _mm256_cvtepi8_epi16(_mm256_castsi256_si128(vec_v_bot_3)));\n\
+            vec_c0 = _mm256_add_epi16(vec_c0, _mm256_cvtepi8_epi16(_mm256_castsi256_si128(vec_v_top_3)));\n\
+            vec_c1 = _mm256_add_epi16(vec_c1, _mm256_cvtepi8_epi16(_mm256_extracti128_si256(vec_v_bot_3, 1)));\n\
+            vec_c1 = _mm256_add_epi16(vec_c1, _mm256_cvtepi8_epi16(_mm256_extracti128_si256(vec_v_top_3, 1)));\n\
+        }}\n\
+        __m256i vec_gc0 = _mm256_loadu_si256(reinterpret_cast<__m256i*>(c + i));\n\
+        __m256i vec_gc1 = _mm256_loadu_si256(reinterpret_cast<__m256i*>(c + i + 8));\n\
+        __m256i vec_gc2 = _mm256_loadu_si256(reinterpret_cast<__m256i*>(c + i + 16));\n\
+        __m256i vec_gc3 = _mm256_loadu_si256(reinterpret_cast<__m256i*>(c + i + 24));\n\
+        vec_gc0 = _mm256_add_epi32(vec_gc0, _mm256_cvtepi16_epi32(_mm256_castsi256_si128(vec_c0)));\n\
+        vec_gc1 = _mm256_add_epi32(vec_gc1, _mm256_cvtepi16_epi32(_mm256_extracti128_si256(vec_c0, 1)));\n\
+        vec_gc2 = _mm256_add_epi32(vec_gc2, _mm256_cvtepi16_epi32(_mm256_castsi256_si128(vec_c1)));\n\
+        vec_gc3 = _mm256_add_epi32(vec_gc3, _mm256_cvtepi16_epi32(_mm256_extracti128_si256(vec_c1, 1)));\n\
+        _mm256_storeu_si256(reinterpret_cast<__m256i*>(c + i), vec_gc0);\n\
+        _mm256_storeu_si256(reinterpret_cast<__m256i*>(c + i + 8), vec_gc1);\n\
+        _mm256_storeu_si256(reinterpret_cast<__m256i*>(c + i + 16), vec_gc2);\n\
+        _mm256_storeu_si256(reinterpret_cast<__m256i*>(c + i + 24), vec_gc3);\n\
+    }}\n".format(pre)])
+
+        kernel_code = "".join([kernel_code, "\
+#elif defined __ARM_NEON\n\
+    const int KK = BBK{0} / 3;\n\
+    const uint8x16_t vec_mask = vdupq_n_u8(0x0f);\n\
+#pragma unroll\n\
+    for (int i = 0; i < BM{0}; i += 32) {{\n\
+        int16x8_t vec_c0 = vdupq_n_s16(0);\n\
+        int16x8_t vec_c1 = vdupq_n_s16(0);\n\
+        int16x8_t vec_c2 = vdupq_n_s16(0);\n\
+        int16x8_t vec_c3 = vdupq_n_s16(0);\n\
+#pragma unroll \n\
+        for (int k = 0; k < KK / 8; k++) {{\n\
+            uint8x16_t vec_sign_left = vmvnq_u8(vld1q_u8(sign + i * KK / 8 + k * 32));\n\
+            uint8x16_t vec_sign_right = vmvnq_u8(vld1q_u8(sign + i * KK / 8 + k * 32 + 16));\n".format(pre)])
+
+        for i in range(4):
+            kernel_code = "".join([kernel_code, "\
+            int8x16_t vec_k_left_{0} = vld1q_s8(lut + 128 * k + {0} * 32);\n\
+            int8x16_t vec_k_right_{0} = vld1q_s8(lut + 128 * k + {0} * 32 + 16);\n\
+            uint8x16_t vec_sign_left_left_{0} = vcltzq_s8(vshlq_n_u8(vec_sign_left, {0}));\n\
+            uint8x16_t vec_sign_left_right_{0} = vcltzq_s8(vshlq_n_u8(vec_sign_left, {0} + 4));\n\
+            uint8x16_t vec_sign_right_left_{0} = vcltzq_s8(vshlq_n_u8(vec_sign_right, {0}));\n\
+            uint8x16_t vec_sign_right_right_{0} = vcltzq_s8(vshlq_n_u8(vec_sign_right, {0} + 4));\n\
+            uint8x16_t vec_a_left_{0} = vld1q_u8(a + i * KK / 2 + k * 128 + {0} * 32);\n\
+            uint8x16_t vec_a_right_{0} = vld1q_u8(a + i * KK / 2 + k * 128 + {0} * 32 + 16);\n\
+            uint8x16_t vec_a_left_left_{0} = vshrq_n_u8(vec_a_left_{0}, 4);\n\
+            uint8x16_t vec_a_left_right_{0} = vandq_u8(vec_a_left_{0}, vec_mask);\n\
+            uint8x16_t vec_a_right_left_{0} = vshrq_n_u8(vec_a_right_{0}, 4);\n\
+            uint8x16_t vec_a_right_right_{0} = vandq_u8(vec_a_right_{0}, vec_mask);\n\
+            int8x16_t vec_v_top_left_tmp_{0} = vqtbl1q_s8(vec_k_left_{0}, vec_a_left_left_{0});\n\
+            int8x16_t vec_v_bot_left_tmp_{0} = vqtbl1q_s8(vec_k_left_{0}, vec_a_right_left_{0});\n\
+            int8x16_t vec_v_top_right_tmp_{0} = vqtbl1q_s8(vec_k_right_{0}, vec_a_left_right_{0});\n\
+            int8x16_t vec_v_bot_right_tmp_{0} = vqtbl1q_s8(vec_k_right_{0}, vec_a_right_right_{0});\n\
+            vec_v_top_left_tmp_{0} = vbslq_s8(vec_sign_left_left_{0}, vnegq_s8(vec_v_top_left_tmp_{0}), vec_v_top_left_tmp_{0});\n\
+            vec_v_bot_left_tmp_{0} = vbslq_s8(vec_sign_right_left_{0}, vnegq_s8(vec_v_bot_left_tmp_{0}), vec_v_bot_left_tmp_{0});\n\
+            vec_v_top_right_tmp_{0} = vbslq_s8(vec_sign_left_right_{0}, vnegq_s8(vec_v_top_right_tmp_{0}), vec_v_top_right_tmp_{0});\n\
+            vec_v_bot_right_tmp_{0} = vbslq_s8(vec_sign_right_right_{0}, vnegq_s8(vec_v_bot_right_tmp_{0}), vec_v_bot_right_tmp_{0});\n\
+            int16x8_t vec_v_top_left_high_{0} = vmovl_high_s8(vec_v_top_left_tmp_{0});\n\
+            int16x8_t vec_v_top_left_bot_{0} = vmovl_s8(vget_low_s8(vec_v_top_left_tmp_{0}));\n\
+            int16x8_t vec_v_top_right_high_{0} = vmovl_high_s8(vec_v_top_right_tmp_{0});\n\
+            int16x8_t vec_v_top_right_bot_{0} = vmovl_s8(vget_low_s8(vec_v_top_right_tmp_{0}));\n\
+            int16x8_t vec_v_bot_left_high_{0} = vmovl_high_s8(vec_v_bot_left_tmp_{0});\n\
+            int16x8_t vec_v_bot_left_bot_{0} = vmovl_s8(vget_low_s8(vec_v_bot_left_tmp_{0}));\n\
+            int16x8_t vec_v_bot_right_high_{0} = vmovl_high_s8(vec_v_bot_right_tmp_{0});\n\
+            int16x8_t vec_v_bot_right_bot_{0} = vmovl_s8(vget_low_s8(vec_v_bot_right_tmp_{0}));\n\
+            vec_c0 += vec_v_top_left_bot_{0};\n\
+            vec_c0 += vec_v_top_right_bot_{0};\n\
+            vec_c1 += vec_v_bot_left_bot_{0};\n\
+            vec_c1 += vec_v_bot_right_bot_{0};\n\
+            vec_c2 += vec_v_top_left_high_{0};\n\
+            vec_c2 += vec_v_top_right_high_{0};\n\
+            vec_c3 += vec_v_bot_left_high_{0};\n\
+            vec_c3 += vec_v_bot_right_high_{0};\n".format(i)])
+
+        kernel_code = "".join([kernel_code, "\
+        }\n\
+        int32x4_t vec_v_1 = vmovl_high_s16(vec_c0);\n\
+        int32x4_t vec_v_0 = vmovl_s16(vget_low_s16(vec_c0));\n\
+        int32x4_t vec_v_3 = vmovl_high_s16(vec_c1);\n\
+        int32x4_t vec_v_2 = vmovl_s16(vget_low_s16(vec_c1));\n\
+        int32x4_t vec_v_5 = vmovl_high_s16(vec_c2);\n\
+        int32x4_t vec_v_4 = vmovl_s16(vget_low_s16(vec_c2));\n\
+        int32x4_t vec_v_7 = vmovl_high_s16(vec_c3);\n\
+        int32x4_t vec_v_6 = vmovl_s16(vget_low_s16(vec_c3));\n\
+\n\
+        vst1q_s32(c + i,      vld1q_s32(c + i     ) + vec_v_0);\n\
+        vst1q_s32(c + i + 4,  vld1q_s32(c + i + 4 ) + vec_v_1);\n\
+        vst1q_s32(c + i + 8,  vld1q_s32(c + i + 8 ) + vec_v_4);\n\
+        vst1q_s32(c + i + 12, vld1q_s32(c + i + 12) + vec_v_5);\n\
+        vst1q_s32(c + i + 16, vld1q_s32(c + i + 16) + vec_v_2);\n\
+        vst1q_s32(c + i + 20, vld1q_s32(c + i + 20) + vec_v_3);\n\
+        vst1q_s32(c + i + 24, vld1q_s32(c + i + 24) + vec_v_6);\n\
+        vst1q_s32(c + i + 28, vld1q_s32(c + i + 28) + vec_v_7);\n\
+    }\n\
+#endif\n\
+}\n"])
+
+    kernel_code = "".join([kernel_code, "\
+\n\
+template <int BATCH_SIZE>\n\
+inline int32_t two_tbl_impl_{0}(int32_t* c, int8_t* lut, uint8_t* a) {{\n\
+#ifdef __AVX2__\n\
+    const __m256i vec_mask = _mm256_set1_epi8(0x0f);\n\
+    const __m256i vec_sub  = _mm256_set1_epi8(0x01);\n\
+    const int KK = 16;\n\
+    __m256i vec_lut[KK];\n\
+    for (int k = 0; k < KK; k++) {{\n\
+        __m128i vec_k = _mm_loadu_si128(reinterpret_cast<__m128i*>(lut + 8 * k));\n\
+        vec_lut[k] = _mm256_set_m128i(vec_k, vec_k);\n\
+    }}\n\
+#pragma unroll\n\
+    for (int i = 0; i < BM{0} / 2; i += 16) {{\n\
+        __m256i vec_c0 = _mm256_set1_epi16(0);\n\
+        __m256i vec_c1 = _mm256_set1_epi16(0);\n\
+#pragma unroll\n\
+        for (int k = 0; k < KK / 2; k++) {{\n\
+            __m256i vec_as = _mm256_loadu_si256(reinterpret_cast<__m256i*>(a + i * KK + k * 32));\n\
+            __m256i vec_v_bot = _mm256_shuffle_epi8(vec_lut[2 * k + 1], _mm256_and_si256(vec_as, vec_mask));\n\
+            __m256i vec_v_top = _mm256_shuffle_epi8(vec_lut[2 * k], _mm256_and_si256(_mm256_srli_epi16(vec_as, 4), vec_mask));\n\
+            vec_c0 = _mm256_add_epi16(vec_c0, _mm256_cvtepi8_epi16(_mm256_castsi256_si128(vec_v_bot)));\n\
+            vec_c1 = _mm256_add_epi16(vec_c1, _mm256_cvtepi8_epi16(_mm256_extracti128_si256(vec_v_bot, 1)));\n\
+            vec_c0 = _mm256_add_epi16(vec_c0, _mm256_cvtepi8_epi16(_mm256_castsi256_si128(vec_v_top)));\n\
+            vec_c1 = _mm256_add_epi16(vec_c1, _mm256_cvtepi8_epi16(_mm256_extracti128_si256(vec_v_top, 1)));\n\
+        }}\n\
+        __m256i vec_gc0 = _mm256_loadu_si256(reinterpret_cast<__m256i*>(c + i * 2));\n\
+        __m256i vec_gc1 = _mm256_loadu_si256(reinterpret_cast<__m256i*>(c + i * 2 + 8));\n\
+        __m256i vec_gc2 = _mm256_loadu_si256(reinterpret_cast<__m256i*>(c + i * 2 + 16));\n\
+        __m256i vec_gc3 = _mm256_loadu_si256(reinterpret_cast<__m256i*>(c + i * 2 + 24));\n\
+        vec_gc0 = _mm256_add_epi32(vec_gc0, _mm256_cvtepi16_epi32(_mm256_castsi256_si128(vec_c0)));\n\
+        vec_gc1 = _mm256_add_epi32(vec_gc1, _mm256_cvtepi16_epi32(_mm256_extracti128_si256(vec_c0, 1)));\n\
+        vec_gc2 = _mm256_add_epi32(vec_gc2, _mm256_cvtepi16_epi32(_mm256_castsi256_si128(vec_c1)));\n\
+        vec_gc3 = _mm256_add_epi32(vec_gc3, _mm256_cvtepi16_epi32(_mm256_extracti128_si256(vec_c1, 1)));\n\
+        _mm256_storeu_si256(reinterpret_cast<__m256i*>(c + i * 2), vec_gc0);\n\
+        _mm256_storeu_si256(reinterpret_cast<__m256i*>(c + i * 2 + 8), vec_gc1);\n\
+        _mm256_storeu_si256(reinterpret_cast<__m256i*>(c + i * 2 + 16), vec_gc2);\n\
+        _mm256_storeu_si256(reinterpret_cast<__m256i*>(c + i * 2 + 24), vec_gc3);\n\
+    }}\n\
+#elif defined __ARM_NEON\n\
+    const int KK = 16;\n\
+    const uint8x16_t vec_mask = vdupq_n_u8(0x0f);\n\
+    const int8x16_t vec_zero = vdupq_n_s8(0);\n\
+    int8x16_t vec_lut[KK];\n\
+#pragma unroll\n\
+    for (int k = 0; k < KK; k++) {{\n\
+        vec_lut[k] = vld1q_s8(lut + k * 16);\n\
+    }}\n\
+    for (int i = 0; i < BM{0} / 2; i += 16) {{\n\
+        int16x8_t vec_c0 = vdupq_n_s16(0);\n\
+        int16x8_t vec_c1 = vdupq_n_s16(0);\n\
+        int16x8_t vec_c2 = vdupq_n_s16(0);\n\
+        int16x8_t vec_c3 = vdupq_n_s16(0);\n\
+        for (int k = 0; k < KK / 2; k++) {{\n\
+            uint8x16_t vec_a_top = vld1q_u8(a + i * KK + k * 32);\n\
+            uint8x16_t vec_a_bot = vld1q_u8(a + i * KK + k * 32 + 16);\n\
+            uint8x16_t vec_a_top_left = vshrq_n_u8(vec_a_top, 4);\n\
+            uint8x16_t vec_a_top_right = vandq_u8(vec_a_top, vec_mask);\n\
+            uint8x16_t vec_a_bot_left = vshrq_n_u8(vec_a_bot, 4);\n\
+            uint8x16_t vec_a_bot_right = vandq_u8(vec_a_bot, vec_mask);\n\
+            int8x16_t vec_v_top_left_tmp = vqtbl1q_s8(vec_lut[2 * k], vec_a_top_left);\n\
+            int8x16_t vec_v_top_right_tmp = vqtbl1q_s8(vec_lut[2 * k + 1], vec_a_top_right);\n\
+            int8x16_t vec_v_bot_left_tmp = vqtbl1q_s8(vec_lut[2 * k], vec_a_bot_left);\n\
+            int8x16_t vec_v_bot_right_tmp = vqtbl1q_s8(vec_lut[2 * k + 1], vec_a_bot_right);\n\
+            int16x8_t vec_v_top_left_high = vmovl_high_s8(vec_v_top_left_tmp);\n\
+            int16x8_t vec_v_top_left_bot = vmovl_s8(vget_low_s8(vec_v_top_left_tmp));\n\
+            int16x8_t vec_v_top_right_high = vmovl_high_s8(vec_v_top_right_tmp);\n\
+            int16x8_t vec_v_top_right_bot = vmovl_s8(vget_low_s8(vec_v_top_right_tmp));\n\
+            int16x8_t vec_v_bot_left_high = vmovl_high_s8(vec_v_bot_left_tmp);\n\
+            int16x8_t vec_v_bot_left_bot = vmovl_s8(vget_low_s8(vec_v_bot_left_tmp));\n\
+            int16x8_t vec_v_bot_right_high = vmovl_high_s8(vec_v_bot_right_tmp);\n\
+            int16x8_t vec_v_bot_right_bot = vmovl_s8(vget_low_s8(vec_v_bot_right_tmp));\n\
+            vec_c0 += vec_v_top_left_bot;\n\
+            vec_c0 += vec_v_top_right_bot;\n\
+            vec_c1 += vec_v_top_left_high;\n\
+            vec_c1 += vec_v_top_right_high;\n\
+            vec_c2 += vec_v_bot_left_bot;\n\
+            vec_c2 += vec_v_bot_right_bot;\n\
+            vec_c3 += vec_v_bot_left_high;\n\
+            vec_c3 += vec_v_bot_right_high;\n\
+        }}\n\
+        int32x4_t vec_v_1 = vmovl_high_s16(vec_c0);\n\
+        int32x4_t vec_v_0 = vmovl_s16(vget_low_s16(vec_c0));\n\
+        int32x4_t vec_v_3 = vmovl_high_s16(vec_c1);\n\
+        int32x4_t vec_v_2 = vmovl_s16(vget_low_s16(vec_c1));\n\
+        int32x4_t vec_v_5 = vmovl_high_s16(vec_c2);\n\
+        int32x4_t vec_v_4 = vmovl_s16(vget_low_s16(vec_c2));\n\
+        int32x4_t vec_v_7 = vmovl_high_s16(vec_c3);\n\
+        int32x4_t vec_v_6 = vmovl_s16(vget_low_s16(vec_c3));\n\
+        vst1q_s32(c + i * 2,      vld1q_s32(c + i * 2     ) + vec_v_0);\n\
+        vst1q_s32(c + i * 2 + 4,  vld1q_s32(c + i * 2 + 4 ) + vec_v_1);\n\
+        vst1q_s32(c + i * 2 + 8,  vld1q_s32(c + i * 2 + 8 ) + vec_v_2);\n\
+        vst1q_s32(c + i * 2 + 12, vld1q_s32(c + i * 2 + 12) + vec_v_3);\n\
+        vst1q_s32(c + i * 2 + 16, vld1q_s32(c + i * 2 + 16) + vec_v_4);\n\
+        vst1q_s32(c + i * 2 + 20, vld1q_s32(c + i * 2 + 20) + vec_v_5);\n\
+        vst1q_s32(c + i * 2 + 24, vld1q_s32(c + i * 2 + 24) + vec_v_6);\n\
+        vst1q_s32(c + i * 2 + 28, vld1q_s32(c + i * 2 + 28) + vec_v_7);\n\
+    }}\n\
+#endif\n\
+    return 0;\n\
+}};\n\
+\n\
+template <int BATCH_SIZE>\n\
+int32_t three_qgemm_lut_{0}(void* A, void* sign, void* LUT, void* Scales, void* LUT_Scales, void* C) {{\n\
+    alignas(32) uint32_t CBits[BATCH_SIZE * BM{0}];\n\
+    memset(&(CBits[0]), 0, BATCH_SIZE * BM{0} * sizeof(int32_t));\n\
+#pragma unroll\n\
+    for (int32_t k_outer = 0; k_outer < {1} / BBK{0}; ++k_outer) {{\n\
+        three_tbl_impl_{0}<BATCH_SIZE>((&(((int32_t*)CBits)[0])), (&(((int8_t*)LUT)[(k_outer * BBK{0} / 3 * 16)])), (&(((uint8_t*)A)[(k_outer * BBK{0} / 3 / 2 * BM{0})])), (&(((uint8_t*)sign)[(k_outer * BBK{0} / 3 / 8 * BM{0})])));\n\
+    }}\n\
+#pragma unroll\n\
+    for (int i = 0; i < BM{0}; i++) {{\n\
+        ((bitnet_float_type*)C)[i] = (bitnet_float_type)((float)(((int32_t*)CBits)[i]) * ((bitnet_float_type*)LUT_Scales)[0] * ((bitnet_float_type*)Scales)[0]);\n\
+    }}\n\
+  return 0;\n\
+}}\n\
+\n\
+template <int BATCH_SIZE>\n\
+int32_t two_qgemm_lut_{0}(void* A, void* LUT, void* Scales, void* LUT_Scales, void* C) {{\n\
+    alignas(32) uint32_t CBits[BATCH_SIZE * BM{0}];\n\
+    memset(&(CBits[0]), 0, BATCH_SIZE * BM{0} * sizeof(int32_t));\n\
+#pragma unroll\n\
+    for (int32_t k_outer = 0; k_outer < {2} / 32; ++k_outer) {{\n\
+        two_tbl_impl_{0}<BATCH_SIZE>((&(((int32_t*)CBits)[0])), (&(((int8_t*)LUT)[(k_outer * BK2 / 2 * 16)])), (&(((uint8_t*)A)[(k_outer * BK2 / 2 / 2 * BM{0})])));\n\
+    }}\n\
+#pragma unroll\n\
+    for (int i = 0; i < BM{0}; i++) {{\n\
+        ((bitnet_float_type*)C)[i] += (bitnet_float_type)((float)(((int32_t*)CBits)[i]) * ((bitnet_float_type*)LUT_Scales)[0] * ((bitnet_float_type*)Scales)[0]);\n\
+    }}\n\
+  return 0;\n\
+}}\n\
+\n\
+".format(pre, k_list[1], k_list[0])])
+    return kernel_code
+
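+# The AVX2 strings emitted above apply a packed sign bit s to a looked-up int8
+# value v without branching: with m = s - 1 (0x00 when s == 1, 0xff when s == 0),
+# (v + m) ^ m equals v when s == 1 and -v when s == 0. A pure-Python sanity
+# check of that identity (illustrative only, not used by the generator):
+def _check_sign_trick():
+    def to_i8(x):
+        x &= 0xff
+        return x - 256 if x >= 128 else x
+    for v in range(-128, 128):
+        for s in (0, 1):
+            m = (s - 1) & 0xff  # 0x00 keeps v, 0xff negates it
+            assert to_i8(((v + m) & 0xff) ^ m) == to_i8(v if s else -v)
+    return True
+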
+def gen_top_api(kernel_shapes, k_list):
+
+    kernel_code = "void ggml_preprocessor(int bs, int m, int three_k, int two_k, void* B, void* Three_LUT_Scales, void* Two_LUT_Scales, void* Three_QLUT, void* Two_QLUT) {{\n\
+    partial_max_reset(bs, (&(((bitnet_float_type*)Three_LUT_Scales)[0])));\n\
+    partial_max_reset(bs, (&(((bitnet_float_type*)Two_LUT_Scales)[0])));\n\
+    for (int32_t b = 0; b < bs; b++) {{\n\
+        for (int32_t k_outer = 0; k_outer < (three_k + two_k) / 24; ++k_outer) {{\n\
+            three_partial_max((&(((bitnet_float_type*)Three_LUT_Scales)[b])), (&(((bitnet_float_type*)B)[(k_outer * 24)])));\n\
+        }}\n\
+        for (int32_t k_outer = 0; k_outer < (three_k + two_k) / 16; ++k_outer) {{\n\
+            two_partial_max((&(((bitnet_float_type*)Two_LUT_Scales)[b])), (&(((bitnet_float_type*)B)[(k_outer * 16)])));\n\
+        }}\n\
+    }}\n\
+    if (m == {0} && two_k == {1} && three_k == {2}) {{\n\
+        for (int32_t b = 0; b < bs; b++) {{\n\
+            three_lut_ctor<{2}>((&(((int8_t*)Three_QLUT)[b * three_k / 3 * 16])), (&(((bitnet_float_type*)B)[b * (three_k + two_k)])), (&(((bitnet_float_type*)Three_LUT_Scales)[b])));\n\
+            two_lut_ctor<{1}>((&(((int8_t*)Two_QLUT)[b * two_k / 2 * 16])), (&(((bitnet_float_type*)B)[b * (three_k + two_k) + {2}])), (&(((bitnet_float_type*)Two_LUT_Scales)[b])));\n\
+        }}\n\
+    }}\n\
+".format(kernel_shapes[0][0], k_list[0][0], k_list[0][1])
+    for i in range(1, len(kernel_shapes)):
+        kernel_code = "".join([kernel_code, "    else if (m == {0} && two_k == {1} && three_k == {2}) {{\n\
+        for (int32_t b = 0; b < bs; b++) {{\n\
+            three_lut_ctor<{2}>((&(((int8_t*)Three_QLUT)[b * three_k / 3 * 16])), (&(((bitnet_float_type*)B)[b * (three_k + two_k)])), (&(((bitnet_float_type*)Three_LUT_Scales)[b])));\n\
+            two_lut_ctor<{1}>((&(((int8_t*)Two_QLUT)[b * two_k / 2 * 16])), (&(((bitnet_float_type*)B)[b * (three_k + two_k) + {2}])), (&(((bitnet_float_type*)Two_LUT_Scales)[b])));\n\
+        }}\n\
+    }}\n".format(kernel_shapes[i][0], k_list[i][0], k_list[i][1])])
+    kernel_code = "".join([kernel_code, "}\n"])
+
+
+    kernel_code = "".join([kernel_code, "void ggml_qgemm_lut(int bs, int m, int k, int BK, void* A, void* sign, void* LUT, void* Scales, void* LUT_Scales, void* C) {{\n\
+    if (m == {0} && k == {1}) {{\n\
+        if (BK == {2}) {{\n\
+            if (bs == 1) {{\n\
+                two_qgemm_lut_{4}<1>(A, LUT, Scales, LUT_Scales, C);\n\
+            }}\n\
+        }}\n\
+        else if (BK == {3}) {{\n\
+            if (bs == 1) {{\n\
+                three_qgemm_lut_{4}<1>(A, sign, LUT, Scales, LUT_Scales, C);\n\
+            }}\n\
+        }}\n\
+    }}\n\
+".format(kernel_shapes[0][0], kernel_shapes[0][1], k_list[0][0], k_list[0][1], "{}_{}".format(kernel_shapes[0][0], kernel_shapes[0][1]))])
+    for i in range(1, len(kernel_shapes)):
+        kernel_code = "".join([kernel_code, "    else if (m == {0} && k == {1}) {{\n\
+        if (BK == {2}) {{\n\
+            if (bs == 1) {{\n\
+                two_qgemm_lut_{4}<1>(A, LUT, Scales, LUT_Scales, C);\n\
+            }}\n\
+        }}\n\
+        else if (BK == {3}) {{\n\
+            if (bs == 1) {{\n\
+                three_qgemm_lut_{4}<1>(A, sign, LUT, Scales, LUT_Scales, C);\n\
+            }}\n\
+        }}\n\
+    }}\n\
+".format(kernel_shapes[i][0], kernel_shapes[i][1], k_list[i][0], k_list[i][1], "{}_{}".format(kernel_shapes[i][0], kernel_shapes[i][1]))])
+    kernel_code = "".join([kernel_code, "}\n"])
+    return kernel_code
+
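+# For kernel_shapes[0] == [1536, 4096], for example, the loop above emits a
+# dispatch branch of roughly this shape (actual BK constants come from k_list):
+#
+#     if (m == 1536 && k == 4096) {
+#         if (BK == two_k)        { if (bs == 1) two_qgemm_lut_1536_4096<1>(...);   }
+#         else if (BK == three_k) { if (bs == 1) three_qgemm_lut_1536_4096<1>(...); }
+#     }
+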
+def gen_transform_code(kernel_shapes):
+    kernel_code = "\n\
+void ggml_bitnet_transform_tensor(struct ggml_tensor * tensor) {\n\
+    if (!(is_type_supported(tensor->type) && tensor->backend == GGML_BACKEND_TYPE_CPU && tensor->extra == nullptr)) {\n\
+        return;\n\
+    }\n\
+\n\
+    int k = tensor->ne[0];\n\
+    int m = tensor->ne[1];\n\
+    const int lut_scales_size = 1;\n\
+    int bk = 0;\n\
+    int bm = 0;\n"
+
+    kernel_code = "".join([kernel_code, "\n\
+    if (m == {0} && k == {1}) {{\n\
+        bm = BM{0}_{1};\n\
+        bk = BBK{0}_{1};\n\
+    }}\n".format(kernel_shapes[0][0], kernel_shapes[0][1])])
+
+    for i in range(1, len(kernel_shapes)):
+        kernel_code = "".join([kernel_code, "else if (m == {0} && k == {1}) {{\n\
+        bm = BM{0}_{1};\n\
+        bk = BBK{0}_{1};\n\
+    }}\n".format(kernel_shapes[i][0], kernel_shapes[i][1])])
+
+    kernel_code = "".join([kernel_code, "\n\
+    const int n_tile_num = m / bm;\n\
+    const int BK = bk;\n\
+    uint8_t * qweights;\n\
+    bitnet_float_type * scales;\n\
+\n\
+    scales = (bitnet_float_type *) aligned_malloc(sizeof(bitnet_float_type));\n\
+    qweights = (uint8_t *) tensor->data;\n\
+    int nbytes = (k - 256) * m / 3 * 5 / 8 + 256 * m / 2 * 4 / 8;\n\
+    nbytes = 32 - nbytes % 32 + nbytes;\n\
+    float * i2_scales = (float * )(qweights + nbytes);\n\
+\n"])
+
+    kernel_code = "".join([kernel_code, "\
+    scales[0] = (bitnet_float_type) i2_scales[0];\n"])
+
+    kernel_code = "".join([kernel_code, "\n\
+    tensor->extra = bitnet_tensor_extras + bitnet_tensor_extras_index;\n\
+    bitnet_tensor_extras[bitnet_tensor_extras_index++] = {\n\
+        /* .lut_scales_size = */ lut_scales_size,\n\
+        /* .BK              = */ BK,\n\
+        /* .n_tile_num      = */ n_tile_num,\n\
+        /* .qweights        = */ qweights,\n\
+        /* .scales          = */ scales\n\
+    };\n\
+}\n"])
+
+    return kernel_code
+
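+# Sanity math for the nbytes expression emitted above: each weight triple is
+# stored as a 4-bit LUT index plus one sign bit (5 bits per 3 weights), and the
+# trailing pairs as one 4-bit index per 2 weights; the formula hardcodes a
+# 256-column two-grouped tail. A worked example (illustrative only):
+def _nbytes_example(m=1536, k=4096):
+    nbytes = (k - 256) * m // 3 * 5 // 8 + 256 * m // 2 * 4 // 8  # 1327104
+    nbytes += 32 - nbytes % 32  # 32-byte align; over-pads by 32 when already aligned
+    return nbytes  # 1327136
+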
+def get_three_k_two_k(K, bk):
+    bk_num = K // bk
+    three_k = bk_num * bk
+    two_k = K - three_k
+    return two_k, three_k
+
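+# e.g. get_three_k_two_k(4096, 384) == (256, 3840): ten full 384-wide blocks
+# take the three-grouped path and the remaining 256 columns the two-grouped
+# one. Note the return order is (two_k, three_k).
+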
+if __name__ == "__main__":
+    ModelShapeDict = {
+        "bitnet_b1_58-large"                : [[1536, 4096],
+                                               [1536, 1536],
+                                               [4096, 1536]],
+        "bitnet_b1_58-3B"                   : [[3200, 8640],
+                                               [3200, 3200],
+                                               [8640, 3200]],
+        "Llama3-8B-1.58-100B-tokens"        : [[14336, 4096],
+                                               [4096, 14336],
+                                               [1024, 4096],
+                                               [4096, 4096]] 
+    }
+
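+    # Each entry lists the distinct (M, K) shapes of that model's ternary
+    # weight matrices; gen_tbl_impl below emits one kernel variant per shape.
+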
+    parser = argparse.ArgumentParser(description='gen impl')
+    parser.add_argument('--model', default="input", type=str, dest="model",
+                        help="choose from bitnet_b1_58-large/bitnet_b1_58-3B/Llama3-8B-1.58-100B-tokens.")
+    parser.add_argument('--BM', default="input", type=str,
+                        help="comma-separated block lengths for cutting one (M, K) weight into M / BM blocks of shape (BM, K).")
+    parser.add_argument('--BK', default="input", type=str,
+                        help="comma-separated block lengths for cutting one (M, K) weight into K / BK blocks of shape (M, BK).")
+    parser.add_argument('--bm', default="input", type=str,
+                        help="SIMD tile heights; each (bm, 192 / bm) tile is computed with SIMD instructions in one block.")
+    args = parser.parse_args()
+
+    kernel_shapes = ModelShapeDict[args.model]
+
+    BM_list = [int(item) for item in args.BM.split(',')]
+    BK_list = [int(item) for item in args.BK.split(',')]
+    bm_list = [int(item) for item in args.bm.split(',')]
+
+    tbl_impl_code = []
+    k_list = []
+
+    for i in range(len(kernel_shapes)):
+        k_list.append(get_three_k_two_k(kernel_shapes[i][1], BK_list[i]))
+
+    for i in range(len(kernel_shapes)):
+        tbl_impl_code.append(
+            gen_tbl_impl("{}_{}".format(kernel_shapes[i][0], kernel_shapes[i][1]), BM_list[i], BK_list[i], bm_list[i], k_list[i])
+        )
+
+    assert len(BM_list) == len(BK_list) == len(bm_list) == len(kernel_shapes), "number of BM / BK / bm should be {}".format(len(kernel_shapes))
+
+    for i in range(len(kernel_shapes)):
+        assert kernel_shapes[i][0] % BM_list[i] == 0, "M %% BM should be 0"
+        assert (kernel_shapes[i][1] % BK_list[i]) % 32 == 0, "K % BK % 32 should be 0"
+        assert bm_list[i] in [16, 32], "choose bm from [16, 32]"
+
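+    # Illustrative invocation (hypothetical block sizes that satisfy the
+    # assertions above for bitnet_b1_58-large):
+    #   python utils/codegen_tl2_loss.py --model bitnet_b1_58-large \
+    #       --BM 256,128,256 --BK 96,96,96 --bm 32,32,32
+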
+    ctor_code = gen_ctor_code()
+    api_code = gen_top_api(kernel_shapes, k_list)
+    trans_code = gen_transform_code(kernel_shapes)
+
+    output_dir = os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), "include")
+
+    with open(''.join([output_dir, "/bitnet-lut-kernels.h"]), 'w') as f:
+        f.write(''.join("#if defined(GGML_BITNET_TL2_LOSS)"))
+        f.write(''.join(ctor_code))
+        for code in tbl_impl_code:
+            f.write(''.join(code))
+        f.write(''.join(api_code))
+        f.write(''.join(trans_code))
+        f.write(''.join("#endif"))
+
+    config = ConfigParser()
+
+    for i in range(len(kernel_shapes)):
+        section = 'Kernels_{}'.format(i)
+        config.add_section(section)
+        config.set(section, 'M', str(kernel_shapes[i][0]))
+        config.set(section, 'K', str(kernel_shapes[i][1]))
+        config.set(section, 'BM', str(BM_list[i]))
+        config.set(section, 'BK', str(BK_list[i]))
+        config.set(section, 'bmm', str(bm_list[i]))
+
+    with open(''.join([output_dir, "/kernel_config.ini"]), 'w') as configfile:
+        config.write(configfile)
\ No newline at end of file
diff --git a/utils/convert-hf-to-gguf-bitnet.py b/utils/convert-hf-to-gguf-bitnet.py
index f525f58f8..6d0e46cef 100644
--- a/utils/convert-hf-to-gguf-bitnet.py
+++ b/utils/convert-hf-to-gguf-bitnet.py
@@ -517,6 +517,92 @@ def preprocess_weights_tl1(
     return weight
 
 
+def preprocess_two_weights_tl2_loss(M, K, weight_num, BM, BY, bm, by, weight, final_weight):
+    weight = np.reshape(weight, (weight_num // 2, 2))
+    hi_weight = np.multiply(np.split(weight, 2, axis=1)[0], 3)
+    lo_weight = np.split(weight, 2, axis=1)[1]
+
+    weight = np.reshape((hi_weight + lo_weight), weight_num // 2)
+    weight = weight + 4
+    weight = np.reshape(weight, (M, K // 2)).astype(np.uint8)
+    weight = weight.reshape((M // BM, BM, K // 2)).transpose(0, 2, 1)
+    weight = weight.reshape((M // BM, K // BY, BY // 2, BM)).transpose(0, 1, 3, 2)
+    weight = weight.reshape((M // BM, K // BY, BM // bm, bm, BY // 2)).transpose(0, 1, 2, 4, 3)
+    weight = weight.reshape((M // BM, K // BY, BM // bm, BY // by, by // 2, bm)).transpose(0, 1, 2, 3, 5, 4)
+    weight = weight.reshape((M // BM, K // BY, BM // bm, BY // by, bm, by // 2))
+    weight_0 = weight[:, :, :, :, :, 0]
+    weight_1 = weight[:, :, :, :, :, 1]
+    weight = (weight_0 << 4) + weight_1
+    weight = weight.reshape(M * K // bm // by, bm).reshape(M * K // by // 16, 16)
+
+    for i in range(weight.shape[0]):
+        final_weight.append(weight[i, :])
+
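+# Reference for the pair encoding above (illustrative only): each ternary pair
+# (hi, lo) maps to the 4-bit index 3*hi + lo + 4 in [0, 8], which is exactly
+# the slot two_lut_ctor fills with hi*b0 + lo*b1; e.g. (-1, 1) -> index 2 ->
+# -b0 + b1.
+def _pair_index(hi, lo):
+    assert hi in (-1, 0, 1) and lo in (-1, 0, 1)
+    return 3 * hi + lo + 4
+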
+def preprocess_three_weights_tl2_loss(M, K, weight_num, BM, BY, bm, by, weight, final_weight):
+    weight = np.reshape(weight, (weight_num // 3, 3))
+    split_weights = np.split(weight, 3, axis=1)
+    first_weight = np.multiply(split_weights[0], 9)
+    second_weight = np.multiply(split_weights[1], 3)
+    third_weight = split_weights[2]
+
+    weight = np.reshape((first_weight + second_weight + third_weight), weight_num // 3)
+    sign_weight = np.sign(weight)
+    sign_weight = np.where(sign_weight < 1, 0, sign_weight)
+    weight = np.abs(weight)
+
+    weight = np.reshape(weight, (M, K // 3)).astype(np.uint8)
+    sign_weight = np.reshape(sign_weight, (M, K // 3)).astype(np.uint8)
+
+    weight = weight.reshape((M // BM, BM, K // 3)).transpose(0, 2, 1)
+    weight = weight.reshape((M // BM, K // BY, BY // 3, BM)).transpose(0, 1, 3, 2)
+    weight = weight.reshape((M // BM, K // BY, BM // bm, bm, BY // 3)).transpose(0, 1, 2, 4, 3)
+    weight = weight.reshape((M // BM, K // BY, BM // bm, BY // by, by // 3, bm)).transpose(0, 1, 2, 3, 5, 4)
+    weight = weight.reshape((M // BM, K // BY, BM // bm, BY // by, bm, by // 3))
+    
+    weight_list = []
+    for i in range(by // 3):
+        weight_list.append(weight[:, :, :, :, :, i])
+    
+    for i in range(by // 3 // 2):
+        weight_list[i] = (weight_list[i] << 4) + weight_list[i + by // 3 // 2]
+        weight_list[i] = weight_list[i].reshape(M * K // bm // by, bm).reshape(M * K // by // 16, 16)
+
+    for i in range(weight_list[0].shape[0]):
+        for j in range(by // 3 // 2):
+            final_weight.append(weight_list[j][i, :])
+
+    sign_weight = sign_weight.reshape((M // BM, BM, K // 3)).transpose(0, 2, 1)
+    sign_weight = sign_weight.reshape((M // BM, K // BY, BY // 3, BM)).transpose(0, 1, 3, 2)
+    sign_weight = sign_weight.reshape((M // BM, K // BY, BM // bm, bm, BY // 3)).transpose(0, 1, 2, 4, 3)
+    sign_weight = sign_weight.reshape((M // BM, K // BY, BM // bm, BY // (by * 4), by // 3 * 4, bm)).transpose(0, 1, 2, 3, 5, 4).astype(np.uint8)
+
+    combine_weight_list = []
+    for i in range(by // 3 // 2):
+        combine_weight = np.zeros((M // BM, K // BY, BM // bm, BY // (by * 4), bm), dtype=np.uint8)
+        combine_weight_list.append(combine_weight)
+
+    for i in range(8):
+        for j in range(by // 3 // 2):
+            if bm == 16:
+                combine_weight_list[j] = combine_weight_list[j] + (sign_weight[:, :, :, :, :, by // 3 // 2 * i + j] << (7 - i))
+            elif bm == 32:
+                if i > 3:
+                    ti = (i - 4) * 2 + 1
+                else:
+                    ti = i * 2
+                combine_weight_list[j] = combine_weight_list[j] + (sign_weight[:, :, :, :, :, by // 3 // 2 * ti + j] << (7 - i))
+
+    for i in range(by // 3 // 2):
+        combine_weight_list[i] = combine_weight_list[i].reshape((M * K // (by * 4)) // 16, 16)
+
+    for i in range(combine_weight_list[0].shape[0]):
+        for j in range(by // 3 // 2):
+            final_weight.append(combine_weight_list[j][i, :])
+
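+# Reference for the triple encoding above (illustrative only): v = 9a + 3b + c
+# lies in [-13, 13]; the code stores |v| as a 4-bit LUT index plus a sign bit
+# (1 for positive, 0 otherwise) that the kernels use to conditionally negate
+# the looked-up value; e.g. (a, b, c) = (-1, 1, 1) -> v = -5 -> (5, 0).
+def _triple_index(a, b, c):
+    v = 9 * a + 3 * b + c
+    return abs(v), 1 if v > 0 else 0
+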
 def preprocess_two_weights_tl2(M, K, weight_num, BM, BY, bm, by, weight, final_weight):
     weight = np.reshape(weight, (weight_num // 2, 2))
     hi_weight = np.multiply(np.split(weight, 2, axis=1)[0], 3)
@@ -603,7 +689,6 @@ def preprocess_weights_tl2(
     weight = w
     weight = np.where(np.abs(weight) < 1e-6, 0, weight).astype(np.float32)
     weight = np.sign(weight)
-    weight_num = np.prod(weight.shape)
 
     config.read('include/kernel_config.ini')
     BM = -1
@@ -631,7 +716,8 @@ def preprocess_weights_tl2(
 
     final_weight = []
 
-    preprocess_three_weights_tl2(three_weight.shape[0],
+    if args.loss:
+        preprocess_three_weights_tl2_loss(three_weight.shape[0],
                          three_weight.shape[1],
                          three_weight.shape[0] * three_weight.shape[1],
                          BM,
@@ -641,8 +727,29 @@ def preprocess_weights_tl2(
                          three_weight,
                          final_weight)
 
-    if (weight.shape[1] % BY != 0):
-        preprocess_two_weights_tl2(  two_weight.shape[0],
+        if (weight.shape[1] % BY != 0):
+            preprocess_two_weights_tl2_loss(two_weight.shape[0],
+                         two_weight.shape[1],
+                         two_weight.shape[0] * two_weight.shape[1],
+                         BM,
+                         32,
+                         32,
+                         4,
+                         two_weight,
+                         final_weight)
+    else:
+        preprocess_three_weights_tl2(three_weight.shape[0],
+                         three_weight.shape[1],
+                         three_weight.shape[0] * three_weight.shape[1],
+                         BM,
+                         BY,
+                         bm,
+                         by,
+                         three_weight,
+                         final_weight)
+
+        if (weight.shape[1] % BY != 0):
+            preprocess_two_weights_tl2(two_weight.shape[0],
                          two_weight.shape[1],
                          two_weight.shape[0] * two_weight.shape[1],
                          BM,
@@ -652,8 +759,10 @@ def preprocess_weights_tl2(
                          two_weight,
                          final_weight)
     weight = np.array(final_weight, dtype=np.uint8).reshape(-1)
-    weight = np.pad(weight, (0, (K - 256) * M // 3 * 5 // 8 + 256 * M // 2 * 4 // 8 -
-                             weight.shape[0]), mode='constant', constant_values=0)
+    pad_nums = (K - 256) * M // 3 * 5 // 8 + 256 * M // 2 * 4 // 8
+    pad_align_nums = 32 - pad_nums % 32
+    pad_nums = pad_nums + pad_align_nums
+    weight = np.pad(weight, (0, pad_nums - weight.shape[0]), mode='constant', constant_values=0)
     return weight
 
 def transform_to_tl1(x: np.ndarray):
@@ -1116,6 +1225,7 @@ def parse_args() -> argparse.Namespace:
     parser.add_argument("--model-name", type=str, default=None, help="name of the model")
     parser.add_argument("--verbose", action="store_true", help="increase output verbosity")
     parser.add_argument("--quant-embd", action="store_true", help="quantize the embedding layer")
+    parser.add_argument("--loss", action="store_true", help="use loss tl2")
 
     return parser.parse_args()
 

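The new `pad_nums` arithmetic above sizes the packed TL2 buffer and then aligns it up to 32 bytes. Below is a minimal sketch of that computation; the function name and the example `(M, K)` are hypothetical, any shape with `K > 256` and the divisibility the kernels assume would do:

```python
def tl2_packed_nbytes(M: int, K: int) -> int:
    three = (K - 256) * M // 3 * 5 // 8  # 3 ternary weights packed into 5 bits
    two = 256 * M // 2 * 4 // 8          # 2 ternary weights packed into 4 bits
    nbytes = three + two
    # Unconditional align-up, matching `32 - nbytes % 32 + nbytes` in the
    # generated C code: an exact multiple of 32 gains a full extra block.
    return nbytes + (32 - nbytes % 32)

print(tl2_packed_nbytes(M=1536, K=4096))  # hypothetical layer dimensions
```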
From 61e37b5430898061f53d9914262a95849b45ffee Mon Sep 17 00:00:00 2001
From: Eddie-Wang1120 
Date: Sun, 16 Feb 2025 15:07:08 +0800
Subject: [PATCH 2/4] update README

---
 README.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 0b6bac53b..e8db97ed2 100644
--- a/README.md
+++ b/README.md
@@ -160,7 +160,7 @@ huggingface-cli download 1bitLLM/bitnet_b1_58-large --local-dir models/bitnet_b1
 python setup_env.py -md models/bitnet_b1_58-large -q i2_s
 ```
 
-usage: setup_env.py [-h] [--hf-repo {1bitLLM/bitnet_b1_58-large,1bitLLM/bitnet_b1_58-3B,HF1BitLLM/Llama3-8B-1.58-100B-tokens,tiiuae/Falcon3-1B-Instruct-1.58bit,tiiuae/Falcon3-3B-Instruct-1.58bit,tiiuae/Falcon3-7B-Instruct-1.58bit,tiiuae/Falcon3-10B-Instruct-1.58bit}] [--model-dir MODEL_DIR] [--log-dir LOG_DIR] [--quant-type {i2_s,tl1}] [--quant-embd]
+usage: setup_env.py [-h] [--hf-repo {1bitLLM/bitnet_b1_58-large,1bitLLM/bitnet_b1_58-3B,HF1BitLLM/Llama3-8B-1.58-100B-tokens,tiiuae/Falcon3-1B-Instruct-1.58bit,tiiuae/Falcon3-3B-Instruct-1.58bit,tiiuae/Falcon3-7B-Instruct-1.58bit,tiiuae/Falcon3-10B-Instruct-1.58bit}] [--model-dir MODEL_DIR] [--log-dir LOG_DIR] [--quant-type {i2_s,tl1,tl2,tl2-loss}] [--quant-embd]
                     [--use-pretuned]
 
 Setup the environment for running inference
@@ -173,7 +173,7 @@ optional arguments:
                         Directory to save/load the model
   --log-dir LOG_DIR, -ld LOG_DIR
                         Directory to save the logging info
-  --quant-type {i2_s,tl1}, -q {i2_s,tl1}
+  --quant-type {i2_s,tl1,tl2,tl2-loss}, -q {i2_s,tl1,tl2,tl2-loss}
                         Quantization type
   --quant-embd          Quantize the embeddings to f16
   --use-pretuned, -p    Use the pretuned kernel parameters

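A minimal argparse sketch of the updated CLI surface, abbreviated to the options this patch touches (the real setup_env.py defines more arguments than shown here):

```python
import argparse

parser = argparse.ArgumentParser(
    description="Setup the environment for running inference")
parser.add_argument("--quant-type", "-q", default="i2_s",
                    choices=["i2_s", "tl1", "tl2", "tl2-loss"],
                    help="Quantization type")
parser.add_argument("--use-pretuned", "-p", action="store_true",
                    help="Use the pretuned kernel parameters")
print(parser.parse_args().quant_type)
```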
From 0ab05d6f6429d1f8099ce6d62bd6267132231c31 Mon Sep 17 00:00:00 2001
From: Eddie-Wang1120 
Date: Sun, 16 Feb 2025 15:39:15 +0800
Subject: [PATCH 3/4] update 3rdparty & fix tl2 bug

---
 3rdparty/llama.cpp   |  2 +-
 utils/codegen_tl2.py | 26 +++++++++++++++++++-------
 2 files changed, 20 insertions(+), 8 deletions(-)

diff --git a/3rdparty/llama.cpp b/3rdparty/llama.cpp
index 957b59d22..5095a9566 160000
--- a/3rdparty/llama.cpp
+++ b/3rdparty/llama.cpp
@@ -1 +1 @@
-Subproject commit 957b59d2207370cd5061dd1bb12d079aa267fbab
+Subproject commit 5095a956646d2143362eecbc78800fe4f6e70007
diff --git a/utils/codegen_tl2.py b/utils/codegen_tl2.py
index 4d9408123..f0d7d3654 100644
--- a/utils/codegen_tl2.py
+++ b/utils/codegen_tl2.py
@@ -5,6 +5,7 @@
 def gen_ctor_code():
     kernel_code = "\n\
 #include \"ggml-bitnet.h\"\n\
+#include \"ggml-cpu-impl.h\"\n\
 #include <immintrin.h>\n\
 #include <string.h>\n\
 #define GGML_BITNET_MAX_NODES 8192\n\
@@ -105,7 +106,7 @@ def gen_ctor_code():
 template\n\
 inline int32_t three_lut_ctor(int8_t* qlut, bitnet_float_type* b, bitnet_float_type* lut_scales) {\n\
 #if defined __AVX2__\n\
-    __m256i vec_lut[16];\n\
+    __m256 vec_lut[16];\n\
     const __m256i vec_bi = _mm256_set_epi32(84, 72, 60, 48, 36, 24, 12, 0);\n\
     float scales = *lut_scales;\n\
     __m256i shuffle_mask = _mm256_set_epi8(\n\
@@ -191,7 +192,7 @@ def gen_ctor_code():
 template\n\
 inline int32_t two_lut_ctor(int8_t* qlut, bitnet_float_type* b, bitnet_float_type* lut_scales) {\n\
 #if defined __AVX2__\n\
-    __m256i vec_lut[16];\n\
+    __m256 vec_lut[16];\n\
     const __m256i vec_bi = _mm256_set_epi32(56, 48, 40, 32, 24, 16, 8, 0);\n\
     float scales = *lut_scales;\n\
     __m256i shuffle_mask = _mm256_set_epi8(\n\
@@ -623,7 +624,7 @@ def gen_top_api(kernel_shapes, k_list):
     kernel_code = "".join([kernel_code, "}\n"])
     return kernel_code
 
-def gen_transform_code(kernel_shapes):
+def gen_transform_code(kernel_shapes, fp16):
     kernel_code = "\n\
 void ggml_bitnet_transform_tensor(struct ggml_tensor * tensor) {\n\
     if (!(is_type_supported(tensor->type) && tensor->backend == GGML_BACKEND_TYPE_CPU && tensor->extra == nullptr)) {\n\
@@ -657,10 +658,20 @@ def gen_transform_code(kernel_shapes):
     scales = (bitnet_float_type *) aligned_malloc(sizeof(bitnet_float_type));\n\
     qweights = (uint8_t *) tensor->data;\n\
     int nbytes = (k - 256) * m / 3 * 5 / 8 + 256 * m / 2 * 4 / 8;\n\
-    if (nbytes % 32 != 0) nbytes = 32 - nbytes % 32 + nbytes;\n\
+    nbytes = 32 - nbytes % 32 + nbytes;\n\
     float * i2_scales = (float * )(qweights + nbytes);\n\
-    scales[0] = (bitnet_float_type) i2_scales[0];\n\
-\n\
+\n"])
+
+    if fp16:
+        kernel_code = "".join([kernel_code, "\
+    ggml_fp16_t* fp16_scale = (ggml_fp16_t *)aligned_malloc(sizeof(ggml_fp16_t));\n\
+    fp16_scale[0] = GGML_FP32_TO_FP16(i2_scales[0]);\n\
+    scales[0] = (bitnet_float_type) GGML_FP16_TO_FP32(fp16_scale[0]);\n"])
+    else:
+        kernel_code = "".join([kernel_code, "\
+    scales[0] = (bitnet_float_type) i2_scales[0];\n"])
+
+    kernel_code = "".join([kernel_code, "\n\
     tensor->extra = bitnet_tensor_extras + bitnet_tensor_extras_index;\n\
     bitnet_tensor_extras[bitnet_tensor_extras_index++] = {\n\
         /* .lut_scales_size = */ lut_scales_size,\n\
@@ -702,6 +713,7 @@ def get_three_k_two_k(K, bk):
                         help="block length when cutting one weight (M, K) into K / BK weights (M, BK).")
     parser.add_argument('--bm',default="input", type=str,
                         help="using simd instructions to compute (bm, 192 / bm) in one block")
+    parser.add_argument('--fp16', action="store_true", help="convert scale to fp16")
     args = parser.parse_args()
 
     kernel_shapes = ModelShapeDict[args.model]
@@ -730,7 +742,7 @@ def get_three_k_two_k(K, bk):
 
     ctor_code = gen_ctor_code()
     api_code = gen_top_api(kernel_shapes, k_list)
-    trans_code = gen_transform_code(kernel_shapes)
+    trans_code = gen_transform_code(kernel_shapes, args.fp16)
 
     output_dir = os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), "include")
 

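The `--fp16` path added above stores the per-tensor scale as fp16 before converting it back to fp32, so inference sees the precision-reduced value. A sketch of that round-trip with NumPy, whose float16 is the same IEEE half format GGML_FP32_TO_FP16 targets (the sample scale is hypothetical):

```python
import numpy as np

def fp16_roundtrip(scale_fp32: float) -> float:
    # fp32 -> fp16 -> fp32, mirroring GGML_FP32_TO_FP16 / GGML_FP16_TO_FP32
    return float(np.float32(np.float16(scale_fp32)))

s = 0.004217  # hypothetical per-tensor scale
print(s, "->", fp16_roundtrip(s))
```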
From 3dcfd14628995c74337c63302cab9f57f0270f20 Mon Sep 17 00:00:00 2001
From: Eddie-Wang1120 
Date: Tue, 18 Feb 2025 15:42:06 +0800
Subject: [PATCH 4/4] update README

---
 README.md          |  12 +++++-------
 assets/f_compa.png | Bin 0 -> 85880 bytes
 2 files changed, 5 insertions(+), 7 deletions(-)
 create mode 100644 assets/f_compa.png

diff --git a/README.md b/README.md
index e8db97ed2..05d9222d1 100644
--- a/README.md
+++ b/README.md
@@ -4,10 +4,8 @@
 
 bitnet.cpp is the official inference framework for 1-bit LLMs (e.g., BitNet b1.58). It offers a suite of optimized kernels that support **fast** and **lossless** inference of 1.58-bit models on CPU (with NPU and GPU support coming next).
 
-The first release of bitnet.cpp is to support inference on CPUs. bitnet.cpp achieves speedups of **1.37x** to **5.07x** on ARM CPUs, with larger models experiencing greater performance gains. Additionally, it reduces energy consumption by **55.4%** to **70.0%**, further boosting overall efficiency. On x86 CPUs, speedups range from **2.37x** to **6.17x** with energy reductions between **71.9%** to **82.2%**. Furthermore, bitnet.cpp can run a 100B BitNet b1.58 model on a single CPU, achieving speeds comparable to human reading (5-7 tokens per second), significantly enhancing the potential for running LLMs on local devices. Please refer to the [technical report](https://arxiv.org/abs/2410.16144) for more details.
-
-<img src="./assets/m2_performance.jpg" alt="m2_performance" width="800"/>
-<img src="./assets/intel_performance.jpg" alt="m2_performance" width="800"/>
+<img src="./assets/f_compa.png" alt="performance" width="800"/>
+
 
 >The tested models are dummy setups used in a research context to demonstrate the inference performance of bitnet.cpp.
 
@@ -18,8 +16,8 @@ A demo of bitnet.cpp running a BitNet b1.58 3B model on Apple M2:
 https://github.com/user-attachments/assets/7f46b736-edec-4828-b809-4be780a3e5b1
 
 ## What's New:
-
-- 11/08/2024 [BitNet a4.8: 4-bit Activations for 1-bit LLMs](https://arxiv.org/abs/2411.04965) ![NEW](https://img.shields.io/badge/NEW-red) 
+- 02/18/2025 [Bitnet.cpp: Efficient Edge Inference for Ternary LLMs](https://arxiv.org/abs/2502.11880) ![NEW](https://img.shields.io/badge/NEW-red) 
+- 11/08/2024 [BitNet a4.8: 4-bit Activations for 1-bit LLMs](https://arxiv.org/abs/2411.04965)
 - 10/21/2024 [1-bit AI Infra: Part 1.1, Fast and Lossless BitNet b1.58 Inference on CPUs](https://arxiv.org/abs/2410.16144)
 - 10/17/2024 bitnet.cpp 1.0 released.
 - 03/21/2024 [The-Era-of-1-bit-LLMs__Training_Tips_Code_FAQ](https://github.com/microsoft/unilm/blob/master/bitnet/The-Era-of-1-bit-LLMs__Training_Tips_Code_FAQ.pdf)
@@ -139,7 +137,7 @@ This project is based on the [llama.cpp](https://github.com/ggerganov/llama.cpp)
 
 1. Clone the repo
 ```bash
-git clone --recursive https://github.com/microsoft/BitNet.git
+git clone --recursive -b paper https://github.com/microsoft/BitNet.git
 cd BitNet
 ```
 2. Install the dependencies
diff --git a/assets/f_compa.png b/assets/f_compa.png
new file mode 100644
index 0000000000000000000000000000000000000000..57e9027e740a259401306667e8524de15c97c8e3
zv{cbsS{VH3X6?1}@^Aazb6Q(7rLQk{Nq=vmlP>5azsGvTrSaWCn@7@5r+gEho*$1I zK3tb_mWP{>(I8#_ik%`(*0;-t&vDg9ypucjtygZORMvEb=*772>()nHzTmZ><^9>5 zE6!B!``ivG-Lm42jC;C_G$y^WtcGQ7S?@*MF4rXujtZ}5U!ISZbUDu|!+-w9UgKiE zR9V_6mNMFe4u{5=il?#EL;TInMP_`h6ni8LFtH=f52WSC*nNaI{2kw`We^o)%})E@ ztS#7MOWP1#xA^S1mseOIAg7lz4;}Kc?a4OQ5ECl2GO7#6&DC*VznuZ3IOoDgGKmi_ zx(n&mPV1#nmgbHW?Lgu)rklc*-Zg?^0P|FLXKlY>?m!Z1GVYdzMYKl6w2ktq%Oy_3 z34u{jIk$?7wsZ`phgTP9WJg=9i%!kx1}&QB?V2ffBA4~=(p+MW~NYi zeSNF6COu;sZRx~thLs*;g>M=K^fmR2OB|KF%d?yg{r(hmp@S+Zv`t1eF>22l8u7-A z7ZR}S(lJRq7r4ren@n^dyhwW}1;N~6lB;@U)O0}YCM9L+X@i$93(vfA`E zGHm6$9RX)4#HM$nJ6=3}ztGvO+JOAd^X&tgQ~A`sE*F>V@fO}WnKLr7qVeII4APBc zr09?Dljhu8xz&44i7l_P@`Ze`7~97nQk;NR{e_q&NQ=VpE{yV?Yz0^Jiz=-X6KY@I zyuGE);h>Y#;;_#1U6oUx`v=wf-fH;&u&~i+47!^4{QkrG*Hd!twSO@x)=kXx4N8?w z_}leG3TxA#AjI=HSUya5M_1>8!%95mXZA{`hZye=6Z@V1u_L3NV^UMor?}YTSZ~qF zGz0FM8w{bR#q6RPm}T9Lmz{aXSifOy*y9Twkxs$eg!;;1*+gYv<+?kk8MEv1Q*8d; z37IUkD3V0}vZ&{Ym9*QHF zbum}3<)tF((~)<+`e!1fljGtC=YNftPqRD|42%wW_oI5ieB>;g!4D*=cEzSXUjIWG zWq@6A*{^t(za%YKwW%7)wS4P1X16;>ac8!vt?-9ycf-shUeEN__v8k(6}1#fko?Cc z?PTcOY{Mh*@69dR`Sw+7QwnI!H>A}BTijHukGx#Ghe^xI@$04^zrPjLbSo*)(k49p zI@*&h7_RsBfwqB^W5QHJx}L_+s)?WPA}%kSv~#rU4SyK3+X$QjXD7{c1Mc*$bB%yO zW#fpxQli&G!rm!*AR;yOb*OwtTe^^(6Em~(<^Hd_tky9P1nK;IRn^wR6?ThEK{ z{dz2hLDo}ny(Ey3WuMt5V-ETXX0o?*d`8)E4--?7!gm_h{rfIw2lD2>xGVPULdRDT znV1u|gR)NPV};wYOwY`da{fb^aL4&RV{(f}_@cd9ToTQX*2XFQ7M8%}X2brpvgxha zMrLlIwf;BMs>1UUy}ulv>s|h#I3k9|83r{Du|uxVl_aoqb(~RafNh_DB~(y4mTDw2Y|mg!0BpMR#+ErX~aeKLtrYir;sm?u@Iaixq ze{JMSA2t-y!`Y7i4SJUmuKJ=_1rgy1R2Ft98%g2#T{UN6V8sw!C6VgswES&tF!3W~ z;tM6y_F-Dc;7E#0*%jCJ;VXj?;_^`xrqGYpmE;1D&y=`g`O9uEzIR2Y=lD~ZVP}p< z#75F@DrhUN*#e{8spWkop#V^+8fu!L{ZEVacuN=oOji#M1>C%~;Yh*#?JquW037R| z$4;3o1~#cHL^=LqiZD;iqrCvQ6Y23Lt$JvC zB#usQLs924(J%XXLlBa=zn$RkLGuPj&AbLFX%(vK>aTHJ`sv%Zhk1=~c?P5chonN) zwKr=wMYQcMuvqy&zi*Us`uY!aA01Uu@x6WfcCQzziln1;RqS0Lgps`D;EWf}f~eso z{GgTE@ow<*Pp3Pt0=oXrs=FM>5i3x`H~co!Z~VgU#;CJ5=&Q(2mDhSZ)NjqYKJ|4K4pP!~mJcj9X*Mc9SMspluXZLv=?4%lB9^CBL7`5N|87;AlwR;~} zprow4rK-a#5AS)O5tYvBbcejp4fG{P2upqMyUxjnjH(ulX#@z0o-<$un|r*0FoOM1 z)kh~{9)&u6qMW9NMin%q#3S(hEw{8LXn>9bpM(}g7;%;z&3MFI9qFCK&S6%saN`}e7Kbzc4-+n@Xp&0CNd#Lr7njvl;o z+n?y@nHU%v#^Hk!swnF0l%v^bJA8?v+j{vL$R1!y>OK#q{yOQF?_E7T58r#c2>`-f8Q`rcv)S#EDZlckgy5RCkqVi0q_QYP=zu~ zP!1qSKb#}ql0nOZpy~fP2vbqSC~zI$EGevZnATM0c}4c;8wm#& zmugVjLeWa<;sM|f{x0mX?2^9Oqqz)4W4xSnwL5Czmmn7fhXH?%Uq6yJ$hd94`Hx&B zv`a^f^yFvsWG3KEBX@-k=Yq1J*<2Ee<|;IO+} zw3M$xn^6I%47j;Rj~?B5+(x>zT-s9;Ow%VU4-*r4!L)_AjbYOz!}68+!7TmKelLFyWdz5UTY7&N(08O=7!l!v;M{E0A<) z;o8=4>3)UefxGrw9)6!t#3*e!1fqrv_C%ZG)AyQ~y)&)~}8Z>Lx zU6>l~p#3JcXPn7Em+o^|+J~p6rm{&pCN!JXYif>rqrbcr>T;YL<+lL`)&^oj#M7rw zA&|eS$@Kt&Vh zQr6%pTO;qb8ia^G0uMF?FOTP8C)_WqXu^t)nyr4|fRFpP^!%w`+vMDJXqx6*>VpXn zVeS)f$~yKX6aKwr7y>CcIp3^eW1zCUB*a^1g)*_bh5YSQ|4KhR0 z=DJ*R0brHRZxRy@4%RF~0dD2Gdbh)vFm9 z_J-K!zE*trunk%-3L&kO%znNN37Lve|M2h)K(`3GutNqg?!_#N`Ypt1kCM=nSzcb= z>$FZ0(pA7hZ{d~EQfJl`859uk5}%QH+GCL#`sOYmAV8cWOwG*TMQV{5n4iCw@Fvjw zN1?#($R8&`Pv8-7o$Zv1c-gW)q0*yoR~gXjAg{%eqlh>Nj}#|J2vW`&jGa16urs&P<> z4vDBDzY7A11S^lleAE2$+m`$Z#9TnMh74Q1vTtlRF8LpT))(5KX~`j+rsTvtPMGl>xu zUwHT?U=`!xLPD70%^9&!EG|EARX{Pcfc-9)K)UNii8eTEv7j|9R z7f;R$yqx47oRK$SOgofgTD3L@9y-__ue$_cFcZ51n7E2!fW5HQ*qMy$Yxd*g%kyyH z!RI5|MA-TfY`XiPvG=0$wHr48!(|6R+XqnL?mXMuq|K(WV;v)74ODSrm%bc_ml<)! z551a(!q3Jma6^H5Sw;LHVIeTlbuMa<7ik*61NR#2AeR{Y0t=rjN9u$^BKLR2L!!L zr(sQa3zt=7S#2JWINh1G*&p`ONuKh=<^>xLyISNffw4^0mH%vyX%W{vDB*vW&w~5R zN&K{<@5KTZhsLKZ78)_+N`}-wyMniVcAKrwz1;JM*R? 
zeJ_Q|>#V1$XZo@Gt;%SFbBGkoG!nzLVdiigO#$vTcf-QMiC7b+C|E1S#_(12z)5)P zTEcUm@zS4vvMsIW4hX!fCv%;}>FNgZ&!Kj8VKUO}|BSS3V)88`6ORSTYMubMkrgGj zCDV}>W`u^Jd+d6^89@JTQ|c!yHU4mrc=BiqL-gK96|t;K5)^8Zj(M+Fb^WF(qU)I{}lGTXEc5FXtkgjQ>LIm}{Eu$qDO> zM-LlST(Wri`MJf!vMMG9W+YB;%CaJc5xUQGnr{zdBL}j#px0FudJZhEkMxEnkM_LqbUCuQGWAa>4VGNNt2K2J&u>m`8b(u zzKT7Gbj%`}$;@n8ytR#;oj{iZ*`(Iu`ys}xSO4tSCOf+dc+tgRF98v!B-? zh(8O$9Zb}&VY7YtE#0`34Ixs)PaZ#?gm;kD|>o+&WlmH%FEFh`x+|7W`yMhMMcC@4u>oD2uZdPv{)`E?EBe8VSoc6;6$EHwdr@LXag#htH|p;Ho!147zEpUM5}t( zG^i~t+J!|$&(K8s3>WCjBkskY2&;hH(M1%Zi||?aKdDWQkwfl7OdpWvAF#osS}Rg! zckQ{(&aA;HX$=hxT{y>T!M~bHrK;kK37&XQ$TNs{_0ir>?d>&2D11x~@YDJWMEQ=J zh@+tDWcd9d_TB$xHFIA3@E@FD&?<-R??K3@Qw_6gbX@Qs zWB-$VB9i~-@qeeK|L=dGV|iZkK@RroUGNr#7H zWn2Lr@_H`84Oj?}u-`SqVxE~dmq0Px!+}OU8?#Jz71G9xJ zQ_x9~@CY5*>j7s_li3cx6bumJ?>K(>-^qlq*07ZmB|0tED$Z-T);dGzOJ$xJ?>ln* z`0*q&sJ@kcwUmtPjMw$7gx*36st$kyD@?dy=X;0tkg&6d7d2bqA za~Ube(d8g3*Vx2?OMs`yY_6l>785YySRmh~$Z(#?pM>@{r7SGL($|Dfw7vBdsGKAfjdT{5?gp zgSq%Zu*D!~B=?b!5ljg&7DOOTK{ay)`ToQ!2`D@$D(7>CiKhWJDkWrlOziA|NQP)B z@L{Lr;NU<=sSx$t36?md1s4Ob5|ogSD5|>HAju-(1FrLR0(5I>5kFH-2PX#N=s`F> zB!q%#t=_jcjGTIt)iwHWX3C~RKfI?wIOSF!C@2# zU%~oe*iNoVNlBUg=*QS1m_fmBti@|wg||(feczqh+S(j5?DVV9uq(4SeR3^@6op_i z=+|LDn$pe8$S@-_0~SU#*o~Q3Sp!P1%nXfQe`;oDcL)FZ3eAxuV=Q(R!9u12M+rw47s3c284BK;hJsfBUXi$z zmMs1L78UajECIg`X*ltrMJvB=kK1XP_v2s;bd^w}u}pqMDoh5w{L-u$=0F9!u~k%5 zI!mwYB``hU=44}GHJF1gF>FH!PugfMjBSaQ;l;nzEO2AXxIi0yxfm2u)| zqBBMz{Uv}4Eqs?6t)>r-Iw&wQGWylz&ZBsks9%PwwW=PBa9Bo)y<0*IO`%}lvUWH$ z1@jmd3N$o}HvMmfiK$17+*V=^gP<60)Z59(LXP)0yjn3?96g_-#&YR7EtVUq!sM~oA5%8GiGmP~ zmO}i${D!Xp3;`g9#At?II@@#O(g#+(X>mHJy8!>d0;*htvwbqwkbcm*&i9qcq10oM zsg65Csr~B}{{L2CWAe32_gxDiOLq>^z<9K>kxcpJ<#h3u2Dx06+SO~Y%-lfSiRE1K zLOX*G!?rXOUV+5EjZNj8djswtgYi>bB#I9f!{Oh#0}`SSK`L>Ra zIavQied~D8jBi7-S58Su$#+?R023=`yu>xQY+@j1%XJU{9RkFcb#?TRabGPh^^CN@ zy!lT#2r0m89C3(8NV|@){~GU$onax-N#e#os|?fNOQDRCtBvyW3>aTmQMM*E5KQe) zm_EcAj>Nw9fbfh*+3KWP58{fO5xDBG;+w!f{Wbl zQ(t^{8x>W&d&i6`eLX#07~9Boh@p3Yb%NbRV{6zweXL@gu&c_m`)~{U!DD<0;(17J zAIL4AaL2KES7Fx$D>-VkFiG>#EtEM(g7KnJUgD%B%83N~0Sv#z>oZ}%hQg(0v2+m_B18*9imuDT7dA=OPC@_U1&dy(lZ=TZ=Rl3S5r};q5M-G zd2bShLPV0TiL{IiH=Lu%L_?By4Z2e<{D`Q+?N`@1F=Anf%%?BoPmsV8M75fv!z}sp wh%bjsqCa(7S6FeM8QKvN%QdYLnc@M7e~}`4*&oF literal 0 HcmV?d00001