From eb25d3dfcbe942271643a677a77d4122227d0ed5 Mon Sep 17 00:00:00 2001
From: yingziyu-llt <yingziyu-Lin@outlook.com>
Date: Wed, 25 Feb 2026 22:54:26 +0800
Subject: [PATCH 1/8] update

---
 .gitignore                         |   4 +
 05_seq_align/generate_benchmark.py |  90 ++++++++
 05_seq_align/main.cu               | 342 +++++++++++++++++++++++++++++
 3 files changed, 436 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 05_seq_align/generate_benchmark.py
 create mode 100644 05_seq_align/main.cu

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..d746c6d
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,4 @@
+*.txt
+*.fastq
+*.fasta
+main
\ No newline at end of file
diff --git a/05_seq_align/generate_benchmark.py b/05_seq_align/generate_benchmark.py
new file mode 100644
index 0000000..47333e8
--- /dev/null
+++ b/05_seq_align/generate_benchmark.py
@@ -0,0 +1,90 @@
+import random
+import os
+
+NUM_CHROMOSOMES = 2
+CHROM_LENGTH = 50000
+NUM_READS = 1000
+READ_LENGTH = 100
+MUTATION_RATE = 0.05
+UNKNOWN_RATE = 0.1
+
+BASES = ['A', 'C', 'G', 'T']
+QUAL_HIGH = 'I'
+QUAL_LOW = '!'
+
+def generate_fasta(filename):
+    chromosomes = {}
+    with open(filename, 'w') as f:
+        for i in range(1, NUM_CHROMOSOMES + 1):
+            chrom_name = f"chr{i}"
+            f.write(f">{chrom_name}\n")
+            
+            seq = []
+            chunk_size = 10000
+            for _ in range(0, CHROM_LENGTH, chunk_size):
+                chunk = ''.join(random.choices(BASES, k=min(chunk_size, CHROM_LENGTH - len(seq))))
+                f.write(chunk + "\n")
+                seq.append(chunk)
+                
+            chromosomes[chrom_name] = ''.join(seq)
+    return chromosomes
+
+def mutate_sequence(seq):
+    mutated = list(seq)
+    quals = [QUAL_HIGH] * len(seq)
+    
+    # Track the actual start shift if deletions happen at the very beginning
+    actual_start_offset = 0 
+    
+    for i in range(len(mutated)):
+        if random.random() < MUTATION_RATE:
+            op = random.choice(['sub', 'ins', 'del'])
+            if op == 'sub':
+                mutated[i] = random.choice([b for b in BASES if b != mutated[i]])
+                quals[i] = QUAL_LOW
+            elif op == 'ins':
+                mutated[i] = mutated[i] + random.choice(BASES)
+                quals[i] = QUAL_LOW
+            elif op == 'del':
+                mutated[i] = ''
+                quals[i] = ''
+                if i == 0: 
+                    actual_start_offset += 1 # If first base is deleted, the real start shifts
+                
+    return ''.join(mutated)[:READ_LENGTH], ''.join(quals)[:READ_LENGTH], actual_start_offset
+
+def generate_fastq_and_truth(chromosomes, fastq_file, truth_file):
+    chrom_names = list(chromosomes.keys())
+    
+    with open(fastq_file, 'w') as fq, open(truth_file, 'w') as tr:
+        for i in range(1, NUM_READS + 1):
+            read_name = f"read{i}"
+            
+            if random.random() < UNKNOWN_RATE:
+                seq = ''.join(random.choices(BASES, k=READ_LENGTH))
+                qual = QUAL_HIGH * READ_LENGTH
+                tr.write(f"{read_name} unknown_origin\n")
+            else:
+                chrom = random.choice(chrom_names)
+                start_pos = random.randint(0, CHROM_LENGTH - READ_LENGTH)
+                origin_seq = chromosomes[chrom][start_pos:start_pos + READ_LENGTH + 10] # Pad for deletions
+                
+                seq, qual, offset = mutate_sequence(origin_seq)
+                
+                while len(seq) < READ_LENGTH:
+                    seq += random.choice(BASES)
+                    qual += QUAL_HIGH
+                    
+                final_pos = start_pos + offset
+                tr.write(f"{read_name} {chrom} {final_pos}\n")
+                
+            fq.write(f"@{read_name}\n{seq}\n+\n{qual}\n")
+
+if __name__ == "__main__":
+    print("Generating reference genome...")
+    chroms = generate_fasta("reference.fasta")
+    
+    print("Generating reads and ground truth...")
+    generate_fastq_and_truth(chroms, "reads.fastq", "truth.txt")
+    
+    print("Benchmark data generated successfully.")
\ No newline at end of file
diff --git a/05_seq_align/main.cu b/05_seq_align/main.cu
new file mode 100644
index 0000000..fa875ca
--- /dev/null
+++ b/05_seq_align/main.cu
@@ -0,0 +1,342 @@
+#include <iostream>
+#include <fstream>
+#include <vector>
+#include <cstdint>
+#include <string>
+#include <unordered_map>
+#include <algorithm>
+#include <omp.h>
+#include <cuda_runtime.h>
+
+#define MATCH_SCORE 2
+#define MISMATCH_SCORE -1
+#define GAP_PENALTY -1
+#define MAX_READ_LEN 256
+#define KMER_SIZE 16
+
+struct Read {
+    std::string name;
+    std::string seq;
+};
+
+void load_reference(const std::string& filename, std::vector<std::string>& names, std::string& concat_seq, std::vector<size_t>& offsets) {
+    std::ifstream file(filename);
+    if (!file.is_open()) exit(1);
+    std::string line, name, seq;
+    bool is_fastq = false;
+
+    if (std::getline(file, line)) {
+        if (line[0] == '@') is_fastq = true;
+        name = line.substr(1);
+    }
+
+    while (std::getline(file, line)) {
+        if (is_fastq) {
+            seq = line;
+            std::getline(file, line); 
+            std::getline(file, line); 
+            names.push_back(name);
+            offsets.push_back(concat_seq.size());
+            concat_seq += seq;
+            if (std::getline(file, line)) name = line.substr(1);
+        } else {
+            if (line[0] == '>') {
+                names.push_back(name);
+                offsets.push_back(concat_seq.size());
+                concat_seq += seq;
+                name = line.substr(1);
+                seq = "";
+            } else {
+                seq += line;
+            }
+        }
+    }
+    if (!is_fastq && !name.empty()) {
+        names.push_back(name);
+        offsets.push_back(concat_seq.size());
+        concat_seq += seq;
+    }
+}
+
+void load_reads(const std::string& filename, std::vector<Read>& reads) {
+    std::ifstream file(filename);
+    if (!file.is_open()) exit(1);
+    std::string line;
+    while (std::getline(file, line)) {
+        if (line.empty()) continue;
+        Read r;
+        r.name = line.substr(1);
+        std::getline(file, r.seq);
+        std::getline(file, line); 
+        std::getline(file, line); 
+        reads.push_back(r);
+    }
+}
+
+inline uint8_t char2val_cpu(char c) {
+    return (c >> 1) & 0x03;
+}
+
+inline uint32_t pack_16mer(const char* seq) {
+    uint32_t packed = 0;
+    for (int i = 0; i < 16; ++i) {
+        uint8_t val = char2val_cpu(seq[i]);
+        packed |= (val << (30 - 2 * i));
+    }
+    return packed;
+}
+
+void encode_sequence_cpu(const std::string& seq, uint32_t* buffer) {
+    size_t length = seq.length();
+    size_t encoded_size = (length + 15) / 16;
+    
+    #pragma omp parallel for
+    for (size_t i = 0; i < encoded_size; ++i) {
+        uint32_t packed = 0;
+        size_t start = i * 16;
+        for (int j = 0; j < 16; ++j) {
+            if (start + j < length) {
+                uint8_t val = char2val_cpu(seq[start + j]);
+                packed |= (val << (30 - 2 * j));
+            }
+        }
+        buffer[i] = packed;
+    }
+}
+
+__device__ inline uint8_t get_base(const uint32_t* packed_array, size_t index) {
+    size_t array_idx = index / 16;
+    int bit_pos = 30 - 2 * (index % 16);
+    return (packed_array[array_idx] >> bit_pos) & 0x03;
+}
+
+__global__ void sw_extend_batched_kernel(
+    const uint32_t* ref_packed, size_t ref_len,
+    const uint32_t* reads_packed, const int* read_lengths,
+    const int* candidate_positions, const int* candidate_read_indices,
+    int total_candidates,
+    int* out_scores, int* out_best_pos)
+{
+    int tid = blockIdx.x * blockDim.x + threadIdx.x;
+    if (tid >= total_candidates) return;
+
+    int ref_start = candidate_positions[tid];
+    int read_idx = candidate_read_indices[tid];
+    int read_len = read_lengths[read_idx];
+    
+    const uint32_t* my_read_packed = reads_packed + read_idx * (MAX_READ_LEN / 16);
+
+    // Give a larger window (100 bp extra) to accommodate high indel rates
+    int ref_region_len = read_len + 100; 
+    if (ref_start + ref_region_len > ref_len) {
+        ref_region_len = ref_len - ref_start;
+    }
+
+    int H_row[MAX_READ_LEN];
+    int Start_row[MAX_READ_LEN];
+    for (int i = 0; i < read_len; ++i) {
+        H_row[i] = 0;
+        Start_row[i] = -1;
+    }
+
+    int max_score = 0;
+    int best_ref_start = ref_start;
+
+    for (int i = 0; i < ref_region_len; ++i) {
+        int current_ref_pos = ref_start + i;
+        uint8_t ref_base = get_base(ref_packed, current_ref_pos);
+        
+        int h_diag = 0; 
+        int start_diag = current_ref_pos; 
+        int h_left = 0;
+        int start_left = -1;
+
+        for (int j = 0; j < read_len; ++j) {
+            uint8_t read_base = get_base(my_read_packed, j);
+            int match = (ref_base == read_base) ? MATCH_SCORE : MISMATCH_SCORE;
+            
+            int score_diag = h_diag + match;
+            int score_up = H_row[j] + GAP_PENALTY;
+            int score_left = h_left + GAP_PENALTY;
+            
+            int score = 0;
+            int start = current_ref_pos;
+            
+            if (score_diag > 0 && score_diag >= score_up && score_diag >= score_left) {
+                score = score_diag;
+                // If previous score was 0, alignment restarts HERE, update start pos
+                start = (h_diag == 0) ? current_ref_pos : start_diag; 
+            } else if (score_up > 0 && score_up >= score_left) {
+                score = score_up;
+                start = (H_row[j] == 0) ? current_ref_pos : Start_row[j];
+            } else if (score_left > 0) {
+                score = score_left;
+                start = (h_left == 0) ? current_ref_pos : start_left;
+            }
+
+            h_diag = H_row[j];
+            start_diag = Start_row[j];
+            
+            H_row[j] = score;  
+            Start_row[j] = start;
+            
+            h_left = score;    
+            start_left = start;
+
+            if (score > max_score) {
+                max_score = score;
+                best_ref_start = start;
+            }
+        }
+    }
+
+    out_scores[tid] = max_score;
+    out_best_pos[tid] = best_ref_start; 
+}
+
+int main() {
+    std::string ref_file = "reference.fasta";
+    std::string reads_file = "reads.fastq";
+    std::ofstream out_file("results.txt");
+
+    std::vector<std::string> ref_names;
+    std::vector<size_t> ref_offsets;
+    std::string concat_ref;
+
+    load_reference(ref_file, ref_names, concat_ref, ref_offsets);
+    
+    std::vector<Read> reads;
+    load_reads(reads_file, reads);
+
+    std::unordered_map<uint32_t, std::vector<int>> kmer_index;
+    // CRITICAL FIX: Stride = 1 guarantees we index EVERY possible 16-mer in the reference
+    for (size_t i = 0; i + KMER_SIZE <= concat_ref.length(); i += 1) { 
+        uint32_t kmer = pack_16mer(concat_ref.c_str() + i);
+        if (kmer_index[kmer].size() < 100) { 
+            kmer_index[kmer].push_back(i);
+        }
+    }
+
+    size_t ref_encoded_size = (concat_ref.length() + 15) / 16;
+    uint32_t* h_ref_packed = new uint32_t[ref_encoded_size];
+    encode_sequence_cpu(concat_ref, h_ref_packed);
+
+    uint32_t* d_ref_packed;
+    cudaMalloc(&d_ref_packed, ref_encoded_size * sizeof(uint32_t));
+    cudaMemcpy(d_ref_packed, h_ref_packed, ref_encoded_size * sizeof(uint32_t), cudaMemcpyHostToDevice);
+
+    int num_reads = reads.size();
+    std::vector<uint32_t> h_reads_packed(num_reads * (MAX_READ_LEN / 16), 0);
+    std::vector<int> h_read_lengths(num_reads, 0);
+    
+    std::vector<int> h_candidate_pos;
+    std::vector<int> h_candidate_read_idx;
+
+    for (int i = 0; i < num_reads; ++i) {
+        int rlen = std::min((int)reads[i].seq.length(), MAX_READ_LEN);
+        h_read_lengths[i] = rlen;
+        
+        uint32_t* read_ptr = h_reads_packed.data() + i * (MAX_READ_LEN / 16);
+        encode_sequence_cpu(reads[i].seq, read_ptr);
+
+        std::vector<int> cands;
+        // CRITICAL FIX: Stride = 2 guarantees dense seed checking against the reference
+        for (int offset = 0; offset <= rlen - KMER_SIZE; offset += 2) {
+            uint32_t seed = pack_16mer(reads[i].seq.c_str() + offset);
+            auto it = kmer_index.find(seed);
+            if (it != kmer_index.end()) {
+                for (int pos : it->second) {
+                    // Start looking slightly before the expected position
+                    int ref_window_start = std::max(0, pos - offset - 50);
+                    cands.push_back(ref_window_start);
+                }
+            }
+        }
+        
+        std::sort(cands.begin(), cands.end());
+        int last_added = -1000;
+        for(int pos : cands) {
+            // De-duplicate overlapping windows
+            if (pos > last_added + 50) { 
+                h_candidate_pos.push_back(pos);
+                h_candidate_read_idx.push_back(i);
+                last_added = pos;
+            }
+        }
+    }
+
+    int total_candidates = h_candidate_pos.size();
+    if (total_candidates == 0) {
+        for (const auto& r : reads) out_file << r.name << " unknown_origin\n";
+        return 0;
+    }
+
+    uint32_t* d_reads_packed;
+    int *d_read_lengths, *d_candidate_pos, *d_candidate_read_idx, *d_out_scores, *d_out_best_pos;
+
+    cudaMalloc(&d_reads_packed, h_reads_packed.size() * sizeof(uint32_t));
+    cudaMalloc(&d_read_lengths, num_reads * sizeof(int));
+    cudaMalloc(&d_candidate_pos, total_candidates * sizeof(int));
+    cudaMalloc(&d_candidate_read_idx, total_candidates * sizeof(int));
+    cudaMalloc(&d_out_scores, total_candidates * sizeof(int));
+    cudaMalloc(&d_out_best_pos, total_candidates * sizeof(int));
+
+    cudaMemcpy(d_reads_packed, h_reads_packed.data(), h_reads_packed.size() * sizeof(uint32_t), cudaMemcpyHostToDevice);
+    cudaMemcpy(d_read_lengths, h_read_lengths.data(), num_reads * sizeof(int), cudaMemcpyHostToDevice);
+    cudaMemcpy(d_candidate_pos, h_candidate_pos.data(), total_candidates * sizeof(int), cudaMemcpyHostToDevice);
+    cudaMemcpy(d_candidate_read_idx, h_candidate_read_idx.data(), total_candidates * sizeof(int), cudaMemcpyHostToDevice);
+
+    int blockSize = 256;
+    int numBlocks = (total_candidates + blockSize - 1) / blockSize;
+    
+    sw_extend_batched_kernel<<<numBlocks, blockSize>>>(
+        d_ref_packed, concat_ref.length(),
+        d_reads_packed, d_read_lengths,
+        d_candidate_pos, d_candidate_read_idx,
+        total_candidates,
+        d_out_scores, d_out_best_pos
+    );
+    cudaDeviceSynchronize();
+
+    std::vector<int> h_out_scores(total_candidates);
+    std::vector<int> h_out_best_pos(total_candidates);
+    cudaMemcpy(h_out_scores.data(), d_out_scores, total_candidates * sizeof(int), cudaMemcpyDeviceToHost);
+    cudaMemcpy(h_out_best_pos.data(), d_out_best_pos, total_candidates * sizeof(int), cudaMemcpyDeviceToHost);
+
+    std::vector<int> best_score_per_read(num_reads, -1);
+    std::vector<int> best_pos_per_read(num_reads, -1);
+
+    for (int i = 0; i < total_candidates; ++i) {
+        int r_idx = h_candidate_read_idx[i];
+        if (h_out_scores[i] > best_score_per_read[r_idx]) {
+            best_score_per_read[r_idx] = h_out_scores[i];
+            best_pos_per_read[r_idx] = h_out_best_pos[i];
+        }
+    }
+
+    for (int i = 0; i < num_reads; ++i) {
+        // Threshold: Need at least 50% length equivalent score (~25 perfect matches minimum)
+        if (best_score_per_read[i] > h_read_lengths[i] * 0.5) { 
+            int global_pos = best_pos_per_read[i];
+            auto it = std::upper_bound(ref_offsets.begin(), ref_offsets.end(), global_pos);
+            int ref_idx = std::distance(ref_offsets.begin(), it) - 1;
+            int local_pos = global_pos - ref_offsets[ref_idx];
+            
+            // Output format strictly matches ground truth for clean diff
+            out_file << reads[i].name << " " << ref_names[ref_idx] << " " << local_pos << "\n";
+        } else {
+            out_file << reads[i].name << " unknown_origin\n";
+        }
+    }
+
+    cudaFree(d_ref_packed);
+    cudaFree(d_reads_packed);
+    cudaFree(d_read_lengths);
+    cudaFree(d_candidate_pos);
+    cudaFree(d_candidate_read_idx);
+    cudaFree(d_out_scores);
+    cudaFree(d_out_best_pos);
+    delete[] h_ref_packed;
+
+    return 0;
+}
\ No newline at end of file

From 709eb38471dd308e9d155d4a300cab6c02fb0ae0 Mon Sep 17 00:00:00 2001
From: yingziyu-llt <yingziyu-Lin@outlook.com>
Date: Mon, 2 Mar 2026 23:11:19 +0800
Subject: [PATCH 2/8] update: remove unordered_map

---
 05_seq_align/main.cu | 189 ++++++++++++++++++++++---------------------
 1 file changed, 98 insertions(+), 91 deletions(-)

diff --git a/05_seq_align/main.cu b/05_seq_align/main.cu
index fa875ca..d9578dd 100644
--- a/05_seq_align/main.cu
+++ b/05_seq_align/main.cu
@@ -1,12 +1,11 @@
-#include <iostream>
-#include <fstream>
-#include <vector>
+#include <parallel/algorithm>
 #include <cstdint>
-#include <string>
-#include <unordered_map>
-#include <algorithm>
-#include <omp.h>
 #include <cuda_runtime.h>
+#include <fstream>
+#include <iostream>
+#include <omp.h>
+#include <string>
+#include <vector>
 
 #define MATCH_SCORE 2
 #define MISMATCH_SCORE -1
@@ -19,26 +18,39 @@ struct Read {
     std::string seq;
 };
 
-void load_reference(const std::string& filename, std::vector<std::string>& names, std::string& concat_seq, std::vector<size_t>& offsets) {
+struct KmerPos {
+    uint32_t kmer;
+    int pos;
+    bool operator<(const KmerPos& other) const {
+        if (kmer != other.kmer) return kmer < other.kmer;
+        return pos < other.pos;
+    }
+};
+
+void load_reference(const std::string &filename, std::vector<std::string> &names, std::string &concat_seq,
+                    std::vector<size_t> &offsets) {
     std::ifstream file(filename);
-    if (!file.is_open()) exit(1);
+    if (!file.is_open())
+        exit(1);
     std::string line, name, seq;
     bool is_fastq = false;
 
     if (std::getline(file, line)) {
-        if (line[0] == '@') is_fastq = true;
+        if (line[0] == '@')
+            is_fastq = true;
         name = line.substr(1);
     }
 
     while (std::getline(file, line)) {
         if (is_fastq) {
             seq = line;
-            std::getline(file, line); 
-            std::getline(file, line); 
+            std::getline(file, line);
+            std::getline(file, line);
             names.push_back(name);
             offsets.push_back(concat_seq.size());
             concat_seq += seq;
-            if (std::getline(file, line)) name = line.substr(1);
+            if (std::getline(file, line))
+                name = line.substr(1);
         } else {
             if (line[0] == '>') {
                 names.push_back(name);
@@ -58,26 +70,26 @@ void load_reference(const std::string& filename, std::vector<std::string>& names
     }
 }
 
-void load_reads(const std::string& filename, std::vector<Read>& reads) {
+void load_reads(const std::string &filename, std::vector<Read> &reads) {
     std::ifstream file(filename);
-    if (!file.is_open()) exit(1);
+    if (!file.is_open())
+        exit(1);
     std::string line;
     while (std::getline(file, line)) {
-        if (line.empty()) continue;
+        if (line.empty())
+            continue;
         Read r;
         r.name = line.substr(1);
         std::getline(file, r.seq);
-        std::getline(file, line); 
-        std::getline(file, line); 
+        std::getline(file, line);
+        std::getline(file, line);
         reads.push_back(r);
     }
 }
 
-inline uint8_t char2val_cpu(char c) {
-    return (c >> 1) & 0x03;
-}
+inline uint8_t char2val_cpu(char c) { return (c >> 1) & 0x03; }
 
-inline uint32_t pack_16mer(const char* seq) {
+inline uint32_t pack_16mer(const char *seq) {
     uint32_t packed = 0;
     for (int i = 0; i < 16; ++i) {
         uint8_t val = char2val_cpu(seq[i]);
@@ -86,48 +98,44 @@ inline uint32_t pack_16mer(const char* seq) {
     return packed;
 }
 
-void encode_sequence_cpu(const std::string& seq, uint32_t* buffer) {
+void encode_sequence_cpu(const std::string &seq, uint32_t *buffer) {
     size_t length = seq.length();
     size_t encoded_size = (length + 15) / 16;
-    
-    #pragma omp parallel for
+#pragma omp parallel for
     for (size_t i = 0; i < encoded_size; ++i) {
         uint32_t packed = 0;
         size_t start = i * 16;
         for (int j = 0; j < 16; ++j) {
             if (start + j < length) {
                 uint8_t val = char2val_cpu(seq[start + j]);
-                packed |= (val << (30 - 2 * j));
+                packed |= (uint32_t)(val << (30 - 2 * j));
             }
         }
         buffer[i] = packed;
     }
 }
 
-__device__ inline uint8_t get_base(const uint32_t* packed_array, size_t index) {
+__device__ inline uint8_t get_base(const uint32_t *packed_array, size_t index) {
     size_t array_idx = index / 16;
     int bit_pos = 30 - 2 * (index % 16);
     return (packed_array[array_idx] >> bit_pos) & 0x03;
 }
 
-__global__ void sw_extend_batched_kernel(
-    const uint32_t* ref_packed, size_t ref_len,
-    const uint32_t* reads_packed, const int* read_lengths,
-    const int* candidate_positions, const int* candidate_read_indices,
-    int total_candidates,
-    int* out_scores, int* out_best_pos)
-{
+__global__ void sw_extend_batched_kernel(const uint32_t *ref_packed, size_t ref_len, const uint32_t *reads_packed,
+                                         const int *read_lengths, const int *candidate_positions,
+                                         const int *candidate_read_indices, int total_candidates, int *out_scores,
+                                         int *out_best_pos) {
     int tid = blockIdx.x * blockDim.x + threadIdx.x;
-    if (tid >= total_candidates) return;
+    if (tid >= total_candidates)
+        return;
 
     int ref_start = candidate_positions[tid];
     int read_idx = candidate_read_indices[tid];
     int read_len = read_lengths[read_idx];
-    
-    const uint32_t* my_read_packed = reads_packed + read_idx * (MAX_READ_LEN / 16);
 
-    // Give a larger window (100 bp extra) to accommodate high indel rates
-    int ref_region_len = read_len + 100; 
+    const uint32_t *my_read_packed = reads_packed + read_idx * (MAX_READ_LEN / 16);
+
+    int ref_region_len = read_len + 100;
     if (ref_start + ref_region_len > ref_len) {
         ref_region_len = ref_len - ref_start;
     }
@@ -145,27 +153,26 @@ __global__ void sw_extend_batched_kernel(
     for (int i = 0; i < ref_region_len; ++i) {
         int current_ref_pos = ref_start + i;
         uint8_t ref_base = get_base(ref_packed, current_ref_pos);
-        
-        int h_diag = 0; 
-        int start_diag = current_ref_pos; 
+
+        int h_diag = 0;
+        int start_diag = current_ref_pos;
         int h_left = 0;
         int start_left = -1;
 
         for (int j = 0; j < read_len; ++j) {
             uint8_t read_base = get_base(my_read_packed, j);
             int match = (ref_base == read_base) ? MATCH_SCORE : MISMATCH_SCORE;
-            
+
             int score_diag = h_diag + match;
             int score_up = H_row[j] + GAP_PENALTY;
             int score_left = h_left + GAP_PENALTY;
-            
+
             int score = 0;
             int start = current_ref_pos;
-            
+
             if (score_diag > 0 && score_diag >= score_up && score_diag >= score_left) {
                 score = score_diag;
-                // If previous score was 0, alignment restarts HERE, update start pos
-                start = (h_diag == 0) ? current_ref_pos : start_diag; 
+                start = (h_diag == 0) ? current_ref_pos : start_diag;
             } else if (score_up > 0 && score_up >= score_left) {
                 score = score_up;
                 start = (H_row[j] == 0) ? current_ref_pos : Start_row[j];
@@ -176,11 +183,11 @@ __global__ void sw_extend_batched_kernel(
 
             h_diag = H_row[j];
             start_diag = Start_row[j];
-            
-            H_row[j] = score;  
+
+            H_row[j] = score;
             Start_row[j] = start;
-            
-            h_left = score;    
+
+            h_left = score;
             start_left = start;
 
             if (score > max_score) {
@@ -191,7 +198,7 @@ __global__ void sw_extend_batched_kernel(
     }
 
     out_scores[tid] = max_score;
-    out_best_pos[tid] = best_ref_start; 
+    out_best_pos[tid] = best_ref_start;
 }
 
 int main() {
@@ -204,60 +211,64 @@ int main() {
     std::string concat_ref;
 
     load_reference(ref_file, ref_names, concat_ref, ref_offsets);
-    
+
     std::vector<Read> reads;
     load_reads(reads_file, reads);
 
-    std::unordered_map<uint32_t, std::vector<int>> kmer_index;
-    // CRITICAL FIX: Stride = 1 guarantees we index EVERY possible 16-mer in the reference
-    for (size_t i = 0; i + KMER_SIZE <= concat_ref.length(); i += 1) { 
-        uint32_t kmer = pack_16mer(concat_ref.c_str() + i);
-        if (kmer_index[kmer].size() < 100) { 
-            kmer_index[kmer].push_back(i);
+    // Build kmer index with array and sort
+    std::vector<KmerPos> kmer_list;
+    if (concat_ref.length() >= KMER_SIZE) {
+        kmer_list.reserve(concat_ref.length() - KMER_SIZE + 1);
+        for (size_t i = 0; i + KMER_SIZE <= concat_ref.length(); i += 1) {
+            kmer_list.push_back({pack_16mer(concat_ref.c_str() + i), (int)i});
         }
+        std::sort(kmer_list.begin(), kmer_list.end());
     }
 
     size_t ref_encoded_size = (concat_ref.length() + 15) / 16;
-    uint32_t* h_ref_packed = new uint32_t[ref_encoded_size];
+    uint32_t *h_ref_packed = new uint32_t[ref_encoded_size];
     encode_sequence_cpu(concat_ref, h_ref_packed);
 
-    uint32_t* d_ref_packed;
+    uint32_t *d_ref_packed;
     cudaMalloc(&d_ref_packed, ref_encoded_size * sizeof(uint32_t));
     cudaMemcpy(d_ref_packed, h_ref_packed, ref_encoded_size * sizeof(uint32_t), cudaMemcpyHostToDevice);
 
     int num_reads = reads.size();
     std::vector<uint32_t> h_reads_packed(num_reads * (MAX_READ_LEN / 16), 0);
     std::vector<int> h_read_lengths(num_reads, 0);
-    
+
     std::vector<int> h_candidate_pos;
     std::vector<int> h_candidate_read_idx;
 
     for (int i = 0; i < num_reads; ++i) {
         int rlen = std::min((int)reads[i].seq.length(), MAX_READ_LEN);
         h_read_lengths[i] = rlen;
-        
-        uint32_t* read_ptr = h_reads_packed.data() + i * (MAX_READ_LEN / 16);
+
+        uint32_t *read_ptr = h_reads_packed.data() + i * (MAX_READ_LEN / 16);
         encode_sequence_cpu(reads[i].seq, read_ptr);
 
         std::vector<int> cands;
-        // CRITICAL FIX: Stride = 2 guarantees dense seed checking against the reference
+        
+        // Find seeds using binary search
         for (int offset = 0; offset <= rlen - KMER_SIZE; offset += 2) {
             uint32_t seed = pack_16mer(reads[i].seq.c_str() + offset);
-            auto it = kmer_index.find(seed);
-            if (it != kmer_index.end()) {
-                for (int pos : it->second) {
-                    // Start looking slightly before the expected position
-                    int ref_window_start = std::max(0, pos - offset - 50);
-                    cands.push_back(ref_window_start);
-                }
+            KmerPos target = {seed, 0};
+            auto it = std::lower_bound(kmer_list.begin(), kmer_list.end(), target);
+            
+            int count = 0;
+            // Limit to max 100 hits per kmer to mimic previous behavior
+            while (it != kmer_list.end() && it->kmer == seed && count < 100) {
+                int ref_window_start = std::max(0, it->pos - offset - 50);
+                cands.push_back(ref_window_start);
+                ++it;
+                ++count;
             }
         }
-        
+
         std::sort(cands.begin(), cands.end());
         int last_added = -1000;
-        for(int pos : cands) {
-            // De-duplicate overlapping windows
-            if (pos > last_added + 50) { 
+        for (int pos : cands) {
+            if (pos > last_added + 50) {
                 h_candidate_pos.push_back(pos);
                 h_candidate_read_idx.push_back(i);
                 last_added = pos;
@@ -267,11 +278,12 @@ int main() {
 
     int total_candidates = h_candidate_pos.size();
     if (total_candidates == 0) {
-        for (const auto& r : reads) out_file << r.name << " unknown_origin\n";
+        for (const auto &r : reads)
+            out_file << r.name << " unknown_origin\n";
         return 0;
     }
 
-    uint32_t* d_reads_packed;
+    uint32_t *d_reads_packed;
     int *d_read_lengths, *d_candidate_pos, *d_candidate_read_idx, *d_out_scores, *d_out_best_pos;
 
     cudaMalloc(&d_reads_packed, h_reads_packed.size() * sizeof(uint32_t));
@@ -284,18 +296,15 @@ int main() {
     cudaMemcpy(d_reads_packed, h_reads_packed.data(), h_reads_packed.size() * sizeof(uint32_t), cudaMemcpyHostToDevice);
     cudaMemcpy(d_read_lengths, h_read_lengths.data(), num_reads * sizeof(int), cudaMemcpyHostToDevice);
     cudaMemcpy(d_candidate_pos, h_candidate_pos.data(), total_candidates * sizeof(int), cudaMemcpyHostToDevice);
-    cudaMemcpy(d_candidate_read_idx, h_candidate_read_idx.data(), total_candidates * sizeof(int), cudaMemcpyHostToDevice);
+    cudaMemcpy(d_candidate_read_idx, h_candidate_read_idx.data(), total_candidates * sizeof(int),
+               cudaMemcpyHostToDevice);
 
     int blockSize = 256;
     int numBlocks = (total_candidates + blockSize - 1) / blockSize;
-    
-    sw_extend_batched_kernel<<<numBlocks, blockSize>>>(
-        d_ref_packed, concat_ref.length(),
-        d_reads_packed, d_read_lengths,
-        d_candidate_pos, d_candidate_read_idx,
-        total_candidates,
-        d_out_scores, d_out_best_pos
-    );
+
+    sw_extend_batched_kernel<<<numBlocks, blockSize>>>(d_ref_packed, concat_ref.length(), d_reads_packed,
+                                                       d_read_lengths, d_candidate_pos, d_candidate_read_idx,
+                                                       total_candidates, d_out_scores, d_out_best_pos);
     cudaDeviceSynchronize();
 
     std::vector<int> h_out_scores(total_candidates);
@@ -315,14 +324,12 @@ int main() {
     }
 
     for (int i = 0; i < num_reads; ++i) {
-        // Threshold: Need at least 50% length equivalent score (~25 perfect matches minimum)
-        if (best_score_per_read[i] > h_read_lengths[i] * 0.5) { 
+        if (best_score_per_read[i] > h_read_lengths[i] * 0.5) {
             int global_pos = best_pos_per_read[i];
             auto it = std::upper_bound(ref_offsets.begin(), ref_offsets.end(), global_pos);
             int ref_idx = std::distance(ref_offsets.begin(), it) - 1;
             int local_pos = global_pos - ref_offsets[ref_idx];
-            
-            // Output format strictly matches ground truth for clean diff
+
             out_file << reads[i].name << " " << ref_names[ref_idx] << " " << local_pos << "\n";
         } else {
             out_file << reads[i].name << " unknown_origin\n";

From e5e39e7b68093df942fd441d5f3de5c2243ac627 Mon Sep 17 00:00:00 2001
From: yingziyu-llt <yingziyu-Lin@outlook.com>
Date: Mon, 2 Mar 2026 23:12:07 +0800
Subject: [PATCH 3/8] update: better parallel

---
 05_seq_align/main.cu | 74 +++++++++++++++++++++++++-------------------
 1 file changed, 43 insertions(+), 31 deletions(-)

diff --git a/05_seq_align/main.cu b/05_seq_align/main.cu
index d9578dd..4f3a7ae 100644
--- a/05_seq_align/main.cu
+++ b/05_seq_align/main.cu
@@ -222,7 +222,7 @@ int main() {
         for (size_t i = 0; i + KMER_SIZE <= concat_ref.length(); i += 1) {
             kmer_list.push_back({pack_16mer(concat_ref.c_str() + i), (int)i});
         }
-        std::sort(kmer_list.begin(), kmer_list.end());
+        __gnu_parallel::sort(kmer_list.begin(), kmer_list.end());
     }
 
     size_t ref_encoded_size = (concat_ref.length() + 15) / 16;
@@ -240,40 +240,52 @@ int main() {
     std::vector<int> h_candidate_pos;
     std::vector<int> h_candidate_read_idx;
 
-    for (int i = 0; i < num_reads; ++i) {
-        int rlen = std::min((int)reads[i].seq.length(), MAX_READ_LEN);
-        h_read_lengths[i] = rlen;
-
-        uint32_t *read_ptr = h_reads_packed.data() + i * (MAX_READ_LEN / 16);
-        encode_sequence_cpu(reads[i].seq, read_ptr);
-
-        std::vector<int> cands;
-        
-        // Find seeds using binary search
-        for (int offset = 0; offset <= rlen - KMER_SIZE; offset += 2) {
-            uint32_t seed = pack_16mer(reads[i].seq.c_str() + offset);
-            KmerPos target = {seed, 0};
-            auto it = std::lower_bound(kmer_list.begin(), kmer_list.end(), target);
-            
-            int count = 0;
-            // Limit to max 100 hits per kmer to mimic previous behavior
-            while (it != kmer_list.end() && it->kmer == seed && count < 100) {
-                int ref_window_start = std::max(0, it->pos - offset - 50);
-                cands.push_back(ref_window_start);
-                ++it;
-                ++count;
+    // Parallelize seed matching
+    #pragma omp parallel
+    {
+        std::vector<int> local_cands_pos;
+        std::vector<int> local_cands_idx;
+
+        #pragma omp for schedule(dynamic, 100)
+        for (int i = 0; i < num_reads; ++i) {
+            int rlen = std::min((int)reads[i].seq.length(), MAX_READ_LEN);
+            h_read_lengths[i] = rlen;
+
+            uint32_t *read_ptr = h_reads_packed.data() + i * (MAX_READ_LEN / 16);
+            encode_sequence_cpu(reads[i].seq, read_ptr);
+
+            std::vector<int> cands;
+            for (int offset = 0; offset <= rlen - KMER_SIZE; offset += 2) {
+                uint32_t seed = pack_16mer(reads[i].seq.c_str() + offset);
+                KmerPos target = {seed, 0};
+                auto it = std::lower_bound(kmer_list.begin(), kmer_list.end(), target);
+                
+                int count = 0;
+                while (it != kmer_list.end() && it->kmer == seed && count < 100) {
+                    int ref_window_start = std::max(0, it->pos - offset - 50);
+                    cands.push_back(ref_window_start);
+                    ++it;
+                    ++count;
+                }
             }
-        }
 
-        std::sort(cands.begin(), cands.end());
-        int last_added = -1000;
-        for (int pos : cands) {
-            if (pos > last_added + 50) {
-                h_candidate_pos.push_back(pos);
-                h_candidate_read_idx.push_back(i);
-                last_added = pos;
+            std::sort(cands.begin(), cands.end());
+            int last_added = -1000;
+            for (int pos : cands) {
+                if (pos > last_added + 50) {
+                    local_cands_pos.push_back(pos);
+                    local_cands_idx.push_back(i);
+                    last_added = pos;
+                }
             }
         }
+
+        // Merge thread-local results
+        #pragma omp critical
+        {
+            h_candidate_pos.insert(h_candidate_pos.end(), local_cands_pos.begin(), local_cands_pos.end());
+            h_candidate_read_idx.insert(h_candidate_read_idx.end(), local_cands_idx.begin(), local_cands_idx.end());
+        }
     }
 
     int total_candidates = h_candidate_pos.size();

From 5b6380f4fb1ddcdcbd89ea836f0528982acbc95e Mon Sep 17 00:00:00 2001
From: yingziyu-llt <yingziyu-Lin@outlook.com>
Date: Mon, 2 Mar 2026 23:22:08 +0800
Subject: [PATCH 4/8] update: better io

---
 05_seq_align/main.cu | 138 +++++++++++++++++++++++++++++++------------
 1 file changed, 99 insertions(+), 39 deletions(-)

diff --git a/05_seq_align/main.cu b/05_seq_align/main.cu
index 4f3a7ae..960bce2 100644
--- a/05_seq_align/main.cu
+++ b/05_seq_align/main.cu
@@ -6,6 +6,12 @@
 #include <omp.h>
 #include <string>
 #include <vector>
+#include <cstring> // For memchr
+#include <fcntl.h>
+#include <sys/mman.h>
+#include <sys/stat.h>
+#include <unistd.h>
+
 
 #define MATCH_SCORE 2
 #define MISMATCH_SCORE -1
@@ -29,64 +35,118 @@ struct KmerPos {
 
 void load_reference(const std::string &filename, std::vector<std::string> &names, std::string &concat_seq,
                     std::vector<size_t> &offsets) {
-    std::ifstream file(filename);
-    if (!file.is_open())
-        exit(1);
-    std::string line, name, seq;
-    bool is_fastq = false;
-
-    if (std::getline(file, line)) {
-        if (line[0] == '@')
-            is_fastq = true;
-        name = line.substr(1);
-    }
-
-    while (std::getline(file, line)) {
+    int fd = open(filename.c_str(), O_RDONLY);
+    if (fd == -1) exit(1);
+    
+    struct stat sb;
+    if (fstat(fd, &sb) == -1) exit(1);
+    size_t size = sb.st_size;
+    
+    const char* mapped_ptr = static_cast<const char*>(mmap(NULL, size, PROT_READ, MAP_PRIVATE, fd, 0));
+    if (mapped_ptr == MAP_FAILED) exit(1);
+    close(fd);
+
+    concat_seq.reserve(size); 
+    
+    const char* ptr = mapped_ptr;
+    const char* end = ptr + size;
+    bool is_fastq = (ptr < end && *ptr == '@');
+
+    while (ptr < end) {
         if (is_fastq) {
-            seq = line;
-            std::getline(file, line);
-            std::getline(file, line);
+            const char* name_end = (const char*)memchr(ptr, '\n', end - ptr);
+            if (!name_end) name_end = end;
+            std::string name(ptr + 1, name_end - (ptr + 1));
+            if (!name.empty() && name.back() == '\r') name.pop_back();
+            ptr = name_end + (name_end < end ? 1 : 0);
+
+            if (ptr >= end) break;
+            const char* seq_end = (const char*)memchr(ptr, '\n', end - ptr);
+            if (!seq_end) seq_end = end;
+            std::string seq(ptr, seq_end - ptr);
+            if (!seq.empty() && seq.back() == '\r') seq.pop_back();
+            ptr = seq_end + (seq_end < end ? 1 : 0);
+
             names.push_back(name);
             offsets.push_back(concat_seq.size());
             concat_seq += seq;
-            if (std::getline(file, line))
-                name = line.substr(1);
+
+            ptr = (const char*)memchr(ptr, '\n', end - ptr);
+            if (ptr) ptr++; else break;
+            ptr = (const char*)memchr(ptr, '\n', end - ptr);
+            if (ptr) ptr++; else break;
         } else {
-            if (line[0] == '>') {
+            if (*ptr == '>') {
+                const char* name_end = (const char*)memchr(ptr, '\n', end - ptr);
+                if (!name_end) name_end = end;
+                std::string name(ptr + 1, name_end - (ptr + 1));
+                if (!name.empty() && name.back() == '\r') name.pop_back();
                 names.push_back(name);
                 offsets.push_back(concat_seq.size());
-                concat_seq += seq;
-                name = line.substr(1);
-                seq = "";
+                ptr = name_end + (name_end < end ? 1 : 0);
             } else {
-                seq += line;
+                const char* seq_end = (const char*)memchr(ptr, '\n', end - ptr);
+                if (!seq_end) seq_end = end;
+                size_t seq_len = seq_end - ptr;
+                if (seq_len > 0 && *(seq_end - 1) == '\r') seq_len--;
+                concat_seq.append(ptr, seq_len);
+                ptr = seq_end + (seq_end < end ? 1 : 0);
             }
         }
     }
-    if (!is_fastq && !name.empty()) {
-        names.push_back(name);
-        offsets.push_back(concat_seq.size());
-        concat_seq += seq;
-    }
+    munmap((void*)mapped_ptr, size);
 }
 
 void load_reads(const std::string &filename, std::vector<Read> &reads) {
-    std::ifstream file(filename);
-    if (!file.is_open())
-        exit(1);
-    std::string line;
-    while (std::getline(file, line)) {
-        if (line.empty())
+    int fd = open(filename.c_str(), O_RDONLY);
+    if (fd == -1) exit(1);
+    
+    struct stat sb;
+    if (fstat(fd, &sb) == -1) exit(1);
+    size_t size = sb.st_size;
+    
+    const char* mapped_ptr = static_cast<const char*>(mmap(NULL, size, PROT_READ, MAP_PRIVATE, fd, 0));
+    if (mapped_ptr == MAP_FAILED) exit(1);
+    close(fd);
+
+    reads.reserve(size / 150); 
+
+    const char* ptr = mapped_ptr;
+    const char* end = ptr + size;
+
+    while (ptr < end) {
+        if (*ptr != '@') {
+            ptr = (const char*)memchr(ptr, '\n', end - ptr);
+            if (ptr) ptr++; else break;
             continue;
+        }
+
         Read r;
-        r.name = line.substr(1);
-        std::getline(file, r.seq);
-        std::getline(file, line);
-        std::getline(file, line);
-        reads.push_back(r);
+        const char* name_end = (const char*)memchr(ptr, '\n', end - ptr);
+        if (!name_end) name_end = end;
+        r.name.assign(ptr + 1, name_end - (ptr + 1));
+        if (!r.name.empty() && r.name.back() == '\r') r.name.pop_back();
+        ptr = name_end + (name_end < end ? 1 : 0);
+        
+        if (ptr >= end) break;
+
+        const char* seq_end = (const char*)memchr(ptr, '\n', end - ptr);
+        if (!seq_end) seq_end = end;
+        r.seq.assign(ptr, seq_end - ptr);
+        if (!r.seq.empty() && r.seq.back() == '\r') r.seq.pop_back();
+        ptr = seq_end + (seq_end < end ? 1 : 0);
+
+        ptr = (const char*)memchr(ptr, '\n', end - ptr);
+        if (ptr) ptr++; else break;
+        ptr = (const char*)memchr(ptr, '\n', end - ptr);
+        if (ptr) ptr++; else break;
+
+        reads.push_back(std::move(r));
     }
+    munmap((void*)mapped_ptr, size);
 }
 
+
 inline uint8_t char2val_cpu(char c) { return (c >> 1) & 0x03; }
 
 inline uint32_t pack_16mer(const char *seq) {

From f862f3e3bd48ad984c8a3c67fd9cc231cfa2c17b Mon Sep 17 00:00:00 2001
From: yingziyu-llt <yingziyu-Lin@outlook.com>
Date: Tue, 3 Mar 2026 17:37:58 +0800
Subject: [PATCH 5/8] better mp

---
 05_seq_align/main.cu | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/05_seq_align/main.cu b/05_seq_align/main.cu
index 960bce2..a8502c7 100644
--- a/05_seq_align/main.cu
+++ b/05_seq_align/main.cu
@@ -262,6 +262,7 @@ __global__ void sw_extend_batched_kernel(const uint32_t *ref_packed, size_t ref_
 }
 
 int main() {
+    cudaFree(0); 
     std::string ref_file = "reference.fasta";
     std::string reads_file = "reads.fastq";
     std::ofstream out_file("results.txt");
@@ -329,7 +330,7 @@ int main() {
                 }
             }
 
-            std::sort(cands.begin(), cands.end());
+            __gnu_parallel::sort(cands.begin(), cands.end());
             int last_added = -1000;
             for (int pos : cands) {
                 if (pos > last_added + 50) {

From 480ee0e6e9404444e112545d0c4ad3a997e60a17 Mon Sep 17 00:00:00 2001
From: yingziyu-llt <yingziyu-Lin@outlook.com>
Date: Thu, 12 Mar 2026 16:56:14 +0800
Subject: [PATCH 6/8] update: wave-form dp

---
 05_seq_align/main.cu | 600 +++++++++++++++++++++++++++++--------------
 1 file changed, 410 insertions(+), 190 deletions(-)

diff --git a/05_seq_align/main.cu b/05_seq_align/main.cu
index a8502c7..6e933f6 100644
--- a/05_seq_align/main.cu
+++ b/05_seq_align/main.cu
@@ -1,152 +1,235 @@
-#include <parallel/algorithm>
+#include <chrono>
 #include <cstdint>
+#include <cstdio>
+#include <cstring>
 #include <cuda_runtime.h>
+#include <fcntl.h>
 #include <fstream>
 #include <iostream>
 #include <omp.h>
+#include <parallel/algorithm>
 #include <string>
-#include <vector>
-#include <cstring> // For memchr
-#include <fcntl.h>
 #include <sys/mman.h>
 #include <sys/stat.h>
 #include <unistd.h>
+#include <vector>
+#ifdef USE_NVTX
+#include <nvToolsExt.h>
+#endif
+
+struct ScopedTimer {
+    const char *name;
+    std::chrono::high_resolution_clock::time_point t0;
+    ScopedTimer(const char *n) : name(n), t0(std::chrono::high_resolution_clock::now()) {
+#ifdef USE_NVTX
+        nvtxRangePushA(name);
+#endif
+    }
+    ~ScopedTimer() {
+        auto t1 = std::chrono::high_resolution_clock::now();
+        double ms = std::chrono::duration<double, std::milli>(t1 - t0).count();
+        std::fprintf(stderr, "[TIMER][CPU] %s: %.3f ms\n", name, ms);
+#ifdef USE_NVTX
+        nvtxRangePop();
+#endif
+    }
+};
 
+struct ScopedCudaEventTimer {
+    const char *name;
+    cudaEvent_t ev0{}, ev1{};
+    ScopedCudaEventTimer(const char *n) : name(n) {
+#ifdef USE_NVTX
+        nvtxRangePushA(name);
+#endif
+        cudaEventCreate(&ev0);
+        cudaEventCreate(&ev1);
+        cudaEventRecord(ev0);
+    }
+    ~ScopedCudaEventTimer() {
+        cudaEventRecord(ev1);
+        cudaEventSynchronize(ev1);
+        float ms = 0.0f;
+        cudaEventElapsedTime(&ms, ev0, ev1);
+        std::fprintf(stderr, "[TIMER][GPU] %s: %.3f ms\n", name, ms);
+        cudaEventDestroy(ev0);
+        cudaEventDestroy(ev1);
+#ifdef USE_NVTX
+        nvtxRangePop();
+#endif
+    }
+};
 
 #define MATCH_SCORE 2
 #define MISMATCH_SCORE -1
 #define GAP_PENALTY -1
 #define MAX_READ_LEN 256
 #define KMER_SIZE 16
+constexpr int LOOKUP_PREFIX_BITS = 22;
+constexpr uint32_t LOOKUP_BUCKETS = (1u << LOOKUP_PREFIX_BITS);
+constexpr int LOOKUP_SHIFT = 32 - LOOKUP_PREFIX_BITS;
 
 struct Read {
     std::string name;
     std::string seq;
 };
 
+// Modified to hold read kmers instead of reference kmers
 struct KmerPos {
     uint32_t kmer;
-    int pos;
-    bool operator<(const KmerPos& other) const {
-        if (kmer != other.kmer) return kmer < other.kmer;
-        return pos < other.pos;
-    }
+    int read_id;
+    int offset;
+    bool operator<(const KmerPos &other) const { return kmer < other.kmer; }
 };
 
 void load_reference(const std::string &filename, std::vector<std::string> &names, std::string &concat_seq,
                     std::vector<size_t> &offsets) {
     int fd = open(filename.c_str(), O_RDONLY);
-    if (fd == -1) exit(1);
-    
+    if (fd == -1)
+        exit(1);
+
     struct stat sb;
-    if (fstat(fd, &sb) == -1) exit(1);
+    if (fstat(fd, &sb) == -1)
+        exit(1);
     size_t size = sb.st_size;
-    
-    const char* mapped_ptr = static_cast<const char*>(mmap(NULL, size, PROT_READ, MAP_PRIVATE, fd, 0));
-    if (mapped_ptr == MAP_FAILED) exit(1);
+
+    const char *mapped_ptr = static_cast<const char *>(mmap(NULL, size, PROT_READ, MAP_PRIVATE, fd, 0));
+    if (mapped_ptr == MAP_FAILED)
+        exit(1);
     close(fd);
 
-    concat_seq.reserve(size); 
-    
-    const char* ptr = mapped_ptr;
-    const char* end = ptr + size;
+    concat_seq.reserve(size);
+
+    const char *ptr = mapped_ptr;
+    const char *end = ptr + size;
     bool is_fastq = (ptr < end && *ptr == '@');
 
     while (ptr < end) {
         if (is_fastq) {
-            const char* name_end = (const char*)memchr(ptr, '\n', end - ptr);
-            if (!name_end) name_end = end;
+            const char *name_end = (const char *)memchr(ptr, '\n', end - ptr);
+            if (!name_end)
+                name_end = end;
             std::string name(ptr + 1, name_end - (ptr + 1));
-            if (!name.empty() && name.back() == '\r') name.pop_back();
+            if (!name.empty() && name.back() == '\r')
+                name.pop_back();
             ptr = name_end + (name_end < end ? 1 : 0);
 
-            if (ptr >= end) break;
-            const char* seq_end = (const char*)memchr(ptr, '\n', end - ptr);
-            if (!seq_end) seq_end = end;
+            if (ptr >= end)
+                break;
+            const char *seq_end = (const char *)memchr(ptr, '\n', end - ptr);
+            if (!seq_end)
+                seq_end = end;
             std::string seq(ptr, seq_end - ptr);
-            if (!seq.empty() && seq.back() == '\r') seq.pop_back();
+            if (!seq.empty() && seq.back() == '\r')
+                seq.pop_back();
             ptr = seq_end + (seq_end < end ? 1 : 0);
 
             names.push_back(name);
             offsets.push_back(concat_seq.size());
             concat_seq += seq;
 
-            ptr = (const char*)memchr(ptr, '\n', end - ptr);
-            if (ptr) ptr++; else break;
-            ptr = (const char*)memchr(ptr, '\n', end - ptr);
-            if (ptr) ptr++; else break;
+            ptr = (const char *)memchr(ptr, '\n', end - ptr);
+            if (ptr)
+                ptr++;
+            else
+                break;
+            ptr = (const char *)memchr(ptr, '\n', end - ptr);
+            if (ptr)
+                ptr++;
+            else
+                break;
         } else {
             if (*ptr == '>') {
-                const char* name_end = (const char*)memchr(ptr, '\n', end - ptr);
-                if (!name_end) name_end = end;
+                const char *name_end = (const char *)memchr(ptr, '\n', end - ptr);
+                if (!name_end)
+                    name_end = end;
                 std::string name(ptr + 1, name_end - (ptr + 1));
-                if (!name.empty() && name.back() == '\r') name.pop_back();
+                if (!name.empty() && name.back() == '\r')
+                    name.pop_back();
                 names.push_back(name);
                 offsets.push_back(concat_seq.size());
                 ptr = name_end + (name_end < end ? 1 : 0);
             } else {
-                const char* seq_end = (const char*)memchr(ptr, '\n', end - ptr);
-                if (!seq_end) seq_end = end;
+                const char *seq_end = (const char *)memchr(ptr, '\n', end - ptr);
+                if (!seq_end)
+                    seq_end = end;
                 size_t seq_len = seq_end - ptr;
-                if (seq_len > 0 && *(seq_end - 1) == '\r') seq_len--;
+                if (seq_len > 0 && *(seq_end - 1) == '\r')
+                    seq_len--;
                 concat_seq.append(ptr, seq_len);
                 ptr = seq_end + (seq_end < end ? 1 : 0);
             }
         }
     }
-    munmap((void*)mapped_ptr, size);
+    munmap((void *)mapped_ptr, size);
 }
 
 void load_reads(const std::string &filename, std::vector<Read> &reads) {
     int fd = open(filename.c_str(), O_RDONLY);
-    if (fd == -1) exit(1);
-    
+    if (fd == -1)
+        exit(1);
+
     struct stat sb;
-    if (fstat(fd, &sb) == -1) exit(1);
+    if (fstat(fd, &sb) == -1)
+        exit(1);
     size_t size = sb.st_size;
-    
-    const char* mapped_ptr = static_cast<const char*>(mmap(NULL, size, PROT_READ, MAP_PRIVATE, fd, 0));
-    if (mapped_ptr == MAP_FAILED) exit(1);
+
+    const char *mapped_ptr = static_cast<const char *>(mmap(NULL, size, PROT_READ, MAP_PRIVATE, fd, 0));
+    if (mapped_ptr == MAP_FAILED)
+        exit(1);
     close(fd);
 
-    reads.reserve(size / 150); 
+    reads.reserve(size / 150);
 
-    const char* ptr = mapped_ptr;
-    const char* end = ptr + size;
+    const char *ptr = mapped_ptr;
+    const char *end = ptr + size;
 
     while (ptr < end) {
         if (*ptr != '@') {
-            ptr = (const char*)memchr(ptr, '\n', end - ptr);
-            if (ptr) ptr++; else break;
+            ptr = (const char *)memchr(ptr, '\n', end - ptr);
+            if (ptr)
+                ptr++;
+            else
+                break;
             continue;
         }
 
         Read r;
-        const char* name_end = (const char*)memchr(ptr, '\n', end - ptr);
-        if (!name_end) name_end = end;
+        const char *name_end = (const char *)memchr(ptr, '\n', end - ptr);
+        if (!name_end)
+            name_end = end;
         r.name.assign(ptr + 1, name_end - (ptr + 1));
-        if (!r.name.empty() && r.name.back() == '\r') r.name.pop_back();
+        if (!r.name.empty() && r.name.back() == '\r')
+            r.name.pop_back();
         ptr = name_end + (name_end < end ? 1 : 0);
-        
-        if (ptr >= end) break;
 
-        const char* seq_end = (const char*)memchr(ptr, '\n', end - ptr);
-        if (!seq_end) seq_end = end;
+        if (ptr >= end)
+            break;
+
+        const char *seq_end = (const char *)memchr(ptr, '\n', end - ptr);
+        if (!seq_end)
+            seq_end = end;
         r.seq.assign(ptr, seq_end - ptr);
-        if (!r.seq.empty() && r.seq.back() == '\r') r.seq.pop_back();
+        if (!r.seq.empty() && r.seq.back() == '\r')
+            r.seq.pop_back();
         ptr = seq_end + (seq_end < end ? 1 : 0);
 
-        ptr = (const char*)memchr(ptr, '\n', end - ptr);
-        if (ptr) ptr++; else break;
-        ptr = (const char*)memchr(ptr, '\n', end - ptr);
-        if (ptr) ptr++; else break;
+        ptr = (const char *)memchr(ptr, '\n', end - ptr);
+        if (ptr)
+            ptr++;
+        else
+            break;
+        ptr = (const char *)memchr(ptr, '\n', end - ptr);
+        if (ptr)
+            ptr++;
+        else
+            break;
 
         reads.push_back(std::move(r));
     }
-    munmap((void*)mapped_ptr, size);
+    munmap((void *)mapped_ptr, size);
 }
 
-
 inline uint8_t char2val_cpu(char c) { return (c >> 1) & 0x03; }
 
 inline uint32_t pack_16mer(const char *seq) {
@@ -161,6 +244,23 @@ inline uint32_t pack_16mer(const char *seq) {
 void encode_sequence_cpu(const std::string &seq, uint32_t *buffer) {
     size_t length = seq.length();
     size_t encoded_size = (length + 15) / 16;
+
+    // 小任务串行，避免 OMP 开销
+    if (encoded_size < 4096) {
+        for (size_t i = 0; i < encoded_size; ++i) {
+            uint32_t packed = 0;
+            size_t start = i * 16;
+            for (int j = 0; j < 16; ++j) {
+                if (start + j < length) {
+                    uint8_t val = char2val_cpu(seq[start + j]);
+                    packed |= (uint32_t)(val << (30 - 2 * j));
+                }
+            }
+            buffer[i] = packed;
+        }
+        return;
+    }
+
 #pragma omp parallel for
     for (size_t i = 0; i < encoded_size; ++i) {
         uint32_t packed = 0;
@@ -181,16 +281,26 @@ __device__ inline uint8_t get_base(const uint32_t *packed_array, size_t index) {
     return (packed_array[array_idx] >> bit_pos) & 0x03;
 }
 
-__global__ void sw_extend_batched_kernel(const uint32_t *ref_packed, size_t ref_len, const uint32_t *reads_packed,
-                                         const int *read_lengths, const int *candidate_positions,
-                                         const int *candidate_read_indices, int total_candidates, int *out_scores,
-                                         int *out_best_pos) {
-    int tid = blockIdx.x * blockDim.x + threadIdx.x;
-    if (tid >= total_candidates)
+extern "C" __global__ void sw_extend_wavefront_kernel(const uint32_t *ref_packed, size_t ref_len,
+                                                      const uint32_t *reads_packed, const int *read_lengths,
+                                                      const int64_t *candidate_positions,
+                                                      const int *candidate_read_indices, int total_candidates,
+                                                      int *out_scores, int64_t *out_best_pos) {
+    int global_warp_id = (blockIdx.x * blockDim.x + threadIdx.x) / 32;
+    int local_warp_id = threadIdx.x / 32;
+    int lane_id = threadIdx.x % 32;
+
+    if (global_warp_id >= total_candidates)
         return;
 
-    int ref_start = candidate_positions[tid];
-    int read_idx = candidate_read_indices[tid];
+    extern __shared__ short shared_buf[];
+
+    int warps_per_block = blockDim.x / 32;
+    short *H_row = &shared_buf[local_warp_id * MAX_READ_LEN];
+    short *Start_row = &shared_buf[local_warp_id * MAX_READ_LEN + (MAX_READ_LEN * warps_per_block)];
+
+    int64_t ref_start = candidate_positions[global_warp_id];
+    int read_idx = candidate_read_indices[global_warp_id];
     int read_len = read_lengths[read_idx];
 
     const uint32_t *my_read_packed = reads_packed + read_idx * (MAX_READ_LEN / 16);
@@ -200,69 +310,117 @@ __global__ void sw_extend_batched_kernel(const uint32_t *ref_packed, size_t ref_
         ref_region_len = ref_len - ref_start;
     }
 
-    int H_row[MAX_READ_LEN];
-    int Start_row[MAX_READ_LEN];
-    for (int i = 0; i < read_len; ++i) {
-        H_row[i] = 0;
-        Start_row[i] = -1;
+    int total_chunks = (read_len + 31) / 32;
+
+    for (int chunk = 0; chunk < total_chunks; ++chunk) {
+        int j = chunk * 32 + lane_id;
+        if (j < read_len) {
+            H_row[j] = 0;
+            Start_row[j] = -1;
+        }
     }
 
-    int max_score = 0;
-    int best_ref_start = ref_start;
-
-    for (int i = 0; i < ref_region_len; ++i) {
-        int current_ref_pos = ref_start + i;
-        uint8_t ref_base = get_base(ref_packed, current_ref_pos);
-
-        int h_diag = 0;
-        int start_diag = current_ref_pos;
-        int h_left = 0;
-        int start_left = -1;
-
-        for (int j = 0; j < read_len; ++j) {
-            uint8_t read_base = get_base(my_read_packed, j);
-            int match = (ref_base == read_base) ? MATCH_SCORE : MISMATCH_SCORE;
-
-            int score_diag = h_diag + match;
-            int score_up = H_row[j] + GAP_PENALTY;
-            int score_left = h_left + GAP_PENALTY;
-
-            int score = 0;
-            int start = current_ref_pos;
-
-            if (score_diag > 0 && score_diag >= score_up && score_diag >= score_left) {
-                score = score_diag;
-                start = (h_diag == 0) ? current_ref_pos : start_diag;
-            } else if (score_up > 0 && score_up >= score_left) {
-                score = score_up;
-                start = (H_row[j] == 0) ? current_ref_pos : Start_row[j];
-            } else if (score_left > 0) {
-                score = score_left;
-                start = (h_left == 0) ? current_ref_pos : start_left;
-            }
+    __syncwarp();
+
+    short max_score = 0;
+    short best_ref_offset = 0;
+
+    short h_diag_reg[8] = {0};
+    short start_diag_reg[8] = {0};
+
+    short next_h_reg[8] = {0};
+    short next_start_reg[8] = {-1};
+
+    int total_steps = ref_region_len + read_len - 1;
+
+    for (int step = 0; step < total_steps; ++step) {
+
+        for (int chunk = 0; chunk < total_chunks; ++chunk) {
+            int j = chunk * 32 + lane_id;
+            int i = step - j;
 
-            h_diag = H_row[j];
-            start_diag = Start_row[j];
+            if (i >= 0 && i < ref_region_len && j < read_len) {
+                int64_t current_ref_pos = ref_start + i;
+                uint8_t ref_base = get_base(ref_packed, current_ref_pos);
+                uint8_t read_base = get_base(my_read_packed, j);
 
-            H_row[j] = score;
-            Start_row[j] = start;
+                short match = (ref_base == read_base) ? MATCH_SCORE : MISMATCH_SCORE;
+
+                short h_up = H_row[j];
+                short start_up = Start_row[j];
+
+                short h_left = (j > 0) ? H_row[j - 1] : 0;
+                short start_left = (j > 0) ? Start_row[j - 1] : -1;
+
+                short score_diag = h_diag_reg[chunk] + match;
+                short score_up = h_up + GAP_PENALTY;
+                short score_left = h_left + GAP_PENALTY;
+
+                short score = 0;
+                short start = i;
+
+                if (score_diag > 0 && score_diag >= score_up && score_diag >= score_left) {
+                    score = score_diag;
+                    start = (h_diag_reg[chunk] == 0) ? i : start_diag_reg[chunk];
+                } else if (score_up > 0 && score_up >= score_left) {
+                    score = score_up;
+                    start = (h_up == 0) ? i : start_up;
+                } else if (score_left > 0) {
+                    score = score_left;
+                    start = (h_left == 0) ? i : start_left;
+                }
+
+                // 【修复核心】将当前步的左侧值，作为下一步的对角线值缓存！
+                h_diag_reg[chunk] = h_left;
+                start_diag_reg[chunk] = start_left;
+
+                next_h_reg[chunk] = score;
+                next_start_reg[chunk] = start;
+
+                if (score > max_score) {
+                    max_score = score;
+                    best_ref_offset = start;
+                }
+            }
+        }
 
-            h_left = score;
-            start_left = start;
+        __syncwarp();
 
-            if (score > max_score) {
-                max_score = score;
-                best_ref_start = start;
+        for (int chunk = 0; chunk < total_chunks; ++chunk) {
+            int j = chunk * 32 + lane_id;
+            int i = step - j;
+            if (i >= 0 && i < ref_region_len && j < read_len) {
+                H_row[j] = next_h_reg[chunk];
+                Start_row[j] = next_start_reg[chunk];
             }
         }
+
+        __syncwarp();
+    }
+
+    for (int offset = 16; offset > 0; offset /= 2) {
+        short shfl_score = __shfl_down_sync(0xffffffff, max_score, offset);
+        short shfl_offset = __shfl_down_sync(0xffffffff, best_ref_offset, offset);
+        if (shfl_score > max_score) {
+            max_score = shfl_score;
+            best_ref_offset = shfl_offset;
+        }
     }
 
-    out_scores[tid] = max_score;
-    out_best_pos[tid] = best_ref_start;
+    if (lane_id == 0) {
+        out_scores[global_warp_id] = max_score;
+        out_best_pos[global_warp_id] = ref_start + best_ref_offset;
+    }
 }
 
 int main() {
-    cudaFree(0); 
+    ScopedTimer t_all("main_total");
+
+    {
+        ScopedTimer t("cuda_warmup");
+        cudaFree(0);
+    }
+
     std::string ref_file = "reference.fasta";
     std::string reads_file = "reads.fastq";
     std::ofstream out_file("results.txt");
@@ -271,81 +429,136 @@ int main() {
     std::vector<size_t> ref_offsets;
     std::string concat_ref;
 
-    load_reference(ref_file, ref_names, concat_ref, ref_offsets);
+    {
+        ScopedTimer t("load_reference");
+        load_reference(ref_file, ref_names, concat_ref, ref_offsets);
+    }
 
-    std::vector<Read> reads;
-    load_reads(reads_file, reads);
+    {
+        ScopedTimer t("append_ref_padding");
+        concat_ref.append(100, 'N');
+    }
 
-    // Build kmer index with array and sort
-    std::vector<KmerPos> kmer_list;
-    if (concat_ref.length() >= KMER_SIZE) {
-        kmer_list.reserve(concat_ref.length() - KMER_SIZE + 1);
-        for (size_t i = 0; i + KMER_SIZE <= concat_ref.length(); i += 1) {
-            kmer_list.push_back({pack_16mer(concat_ref.c_str() + i), (int)i});
-        }
-        __gnu_parallel::sort(kmer_list.begin(), kmer_list.end());
+    std::vector<Read> reads;
+    {
+        ScopedTimer t("load_reads");
+        load_reads(reads_file, reads);
     }
 
+    int num_reads = reads.size();
+
     size_t ref_encoded_size = (concat_ref.length() + 15) / 16;
     uint32_t *h_ref_packed = new uint32_t[ref_encoded_size];
-    encode_sequence_cpu(concat_ref, h_ref_packed);
+
+    {
+        ScopedTimer t("encode_reference_cpu");
+        encode_sequence_cpu(concat_ref, h_ref_packed);
+    }
 
     uint32_t *d_ref_packed;
-    cudaMalloc(&d_ref_packed, ref_encoded_size * sizeof(uint32_t));
-    cudaMemcpy(d_ref_packed, h_ref_packed, ref_encoded_size * sizeof(uint32_t), cudaMemcpyHostToDevice);
+    {
+        ScopedCudaEventTimer t("cuda_malloc_ref");
+        cudaMalloc(&d_ref_packed, ref_encoded_size * sizeof(uint32_t));
+    }
+
+    {
+        ScopedCudaEventTimer t("h2d_ref_copy");
+        cudaMemcpy(d_ref_packed, h_ref_packed, ref_encoded_size * sizeof(uint32_t), cudaMemcpyHostToDevice);
+    }
 
-    int num_reads = reads.size();
     std::vector<uint32_t> h_reads_packed(num_reads * (MAX_READ_LEN / 16), 0);
     std::vector<int> h_read_lengths(num_reads, 0);
+    std::vector<KmerPos> kmer_list;
+    kmer_list.reserve(num_reads * 50);
 
-    std::vector<int> h_candidate_pos;
-    std::vector<int> h_candidate_read_idx;
+    for (int i = 0; i < num_reads; ++i) {
+        int rlen = std::min((int)reads[i].seq.length(), MAX_READ_LEN);
+        h_read_lengths[i] = rlen;
+        encode_sequence_cpu(reads[i].seq, h_reads_packed.data() + i * (MAX_READ_LEN / 16));
 
-    // Parallelize seed matching
-    #pragma omp parallel
-    {
-        std::vector<int> local_cands_pos;
-        std::vector<int> local_cands_idx;
-
-        #pragma omp for schedule(dynamic, 100)
-        for (int i = 0; i < num_reads; ++i) {
-            int rlen = std::min((int)reads[i].seq.length(), MAX_READ_LEN);
-            h_read_lengths[i] = rlen;
-
-            uint32_t *read_ptr = h_reads_packed.data() + i * (MAX_READ_LEN / 16);
-            encode_sequence_cpu(reads[i].seq, read_ptr);
-
-            std::vector<int> cands;
-            for (int offset = 0; offset <= rlen - KMER_SIZE; offset += 2) {
-                uint32_t seed = pack_16mer(reads[i].seq.c_str() + offset);
-                KmerPos target = {seed, 0};
-                auto it = std::lower_bound(kmer_list.begin(), kmer_list.end(), target);
-                
-                int count = 0;
-                while (it != kmer_list.end() && it->kmer == seed && count < 100) {
-                    int ref_window_start = std::max(0, it->pos - offset - 50);
-                    cands.push_back(ref_window_start);
-                    ++it;
-                    ++count;
+        for (int offset = 0; offset <= rlen - KMER_SIZE; offset += 2) {
+            kmer_list.push_back({pack_16mer(reads[i].seq.c_str() + offset), i, offset});
+        }
+    }
+    __gnu_parallel::sort(kmer_list.begin(), kmer_list.end());
+
+    // 22-bit prefix lookup
+    std::vector<uint32_t> lookup(LOOKUP_BUCKETS + 1, (uint32_t)kmer_list.size());
+    uint32_t prefix = 0;
+    lookup[0] = 0;
+    for (size_t i = 0; i < kmer_list.size(); ++i) {
+        uint32_t p = kmer_list[i].kmer >> LOOKUP_SHIFT;
+        while (prefix < p)
+            lookup[++prefix] = (uint32_t)i;
+    }
+    while (prefix < LOOKUP_BUCKETS)
+        lookup[++prefix] = (uint32_t)kmer_list.size();
+
+    std::vector<std::vector<int64_t>> read_cands(num_reads);
+
+    // rolling k-mer scan on reference
+    if (concat_ref.length() >= (KMER_SIZE + 100)) {
+        const int64_t max_pos = (int64_t)concat_ref.length() - KMER_SIZE - 100; // inclusive
+
+#pragma omp parallel
+        {
+            std::vector<std::vector<int64_t>> local_cands(num_reads);
+
+            const int tid = omp_get_thread_num();
+            const int nth = omp_get_num_threads();
+
+            const int64_t total = max_pos + 1; // pos in [0, max_pos]
+            const int64_t chunk = (total + nth - 1) / nth;
+            const int64_t begin = tid * chunk;
+            const int64_t end = std::min<int64_t>(total, begin + chunk);
+
+            if (begin < end) {
+                uint32_t kmer = pack_16mer(concat_ref.data() + begin);
+
+                for (int64_t pos = begin; pos < end; ++pos) {
+                    uint32_t p = kmer >> LOOKUP_SHIFT;
+
+                    for (uint32_t i = lookup[p]; i < lookup[p + 1]; ++i) {
+                        if (kmer_list[i].kmer == kmer) {
+                            int64_t start = pos - kmer_list[i].offset - 50;
+                            if (start >= 0)
+                                local_cands[kmer_list[i].read_id].push_back(start);
+                        }
+                    }
+
+                    // roll to next 16-mer
+                    if (pos + 1 < end) {
+                        kmer = (kmer << 2) | (uint32_t)char2val_cpu(concat_ref[pos + KMER_SIZE]);
+                    }
                 }
             }
 
-            __gnu_parallel::sort(cands.begin(), cands.end());
-            int last_added = -1000;
-            for (int pos : cands) {
-                if (pos > last_added + 50) {
-                    local_cands_pos.push_back(pos);
-                    local_cands_idx.push_back(i);
-                    last_added = pos;
+#pragma omp critical
+            {
+                for (int i = 0; i < num_reads; ++i) {
+                    read_cands[i].insert(read_cands[i].end(), local_cands[i].begin(), local_cands[i].end());
                 }
             }
         }
+    }
 
-        // Merge thread-local results
-        #pragma omp critical
-        {
-            h_candidate_pos.insert(h_candidate_pos.end(), local_cands_pos.begin(), local_cands_pos.end());
-            h_candidate_read_idx.insert(h_candidate_read_idx.end(), local_cands_idx.begin(), local_cands_idx.end());
+    std::vector<int64_t> h_candidate_pos;
+    std::vector<int> h_candidate_read_idx;
+
+    for (int i = 0; i < num_reads; ++i) {
+        if (read_cands[i].empty())
+            continue;
+        std::sort(read_cands[i].begin(), read_cands[i].end());
+        int64_t last_added = -1000;
+        int count = 0;
+        for (int64_t pos : read_cands[i]) {
+            if (pos > last_added + 50) {
+                h_candidate_pos.push_back(pos);
+                h_candidate_read_idx.push_back(i);
+                last_added = pos;
+                if (++count >= 100)
+                    break;
+            }
         }
     }
 
@@ -357,36 +570,43 @@ int main() {
     }
 
     uint32_t *d_reads_packed;
-    int *d_read_lengths, *d_candidate_pos, *d_candidate_read_idx, *d_out_scores, *d_out_best_pos;
+    int *d_read_lengths, *d_candidate_read_idx, *d_out_scores;
+    int64_t *d_candidate_pos, *d_out_best_pos;
 
     cudaMalloc(&d_reads_packed, h_reads_packed.size() * sizeof(uint32_t));
     cudaMalloc(&d_read_lengths, num_reads * sizeof(int));
-    cudaMalloc(&d_candidate_pos, total_candidates * sizeof(int));
+    cudaMalloc(&d_candidate_pos, total_candidates * sizeof(int64_t));
     cudaMalloc(&d_candidate_read_idx, total_candidates * sizeof(int));
     cudaMalloc(&d_out_scores, total_candidates * sizeof(int));
-    cudaMalloc(&d_out_best_pos, total_candidates * sizeof(int));
+    cudaMalloc(&d_out_best_pos, total_candidates * sizeof(int64_t));
 
     cudaMemcpy(d_reads_packed, h_reads_packed.data(), h_reads_packed.size() * sizeof(uint32_t), cudaMemcpyHostToDevice);
     cudaMemcpy(d_read_lengths, h_read_lengths.data(), num_reads * sizeof(int), cudaMemcpyHostToDevice);
-    cudaMemcpy(d_candidate_pos, h_candidate_pos.data(), total_candidates * sizeof(int), cudaMemcpyHostToDevice);
+    cudaMemcpy(d_candidate_pos, h_candidate_pos.data(), total_candidates * sizeof(int64_t), cudaMemcpyHostToDevice);
     cudaMemcpy(d_candidate_read_idx, h_candidate_read_idx.data(), total_candidates * sizeof(int),
                cudaMemcpyHostToDevice);
 
-    int blockSize = 256;
-    int numBlocks = (total_candidates + blockSize - 1) / blockSize;
+    int blockSize = 128; // 4 Warps per Block
+    // Total threads needed = total_candidates * 32
+    int numBlocks = (total_candidates * 32 + blockSize - 1) / blockSize;
+
+    int warps_per_block = blockSize / 32;
+    // Shared memory: warps_per_block * (MAX_READ_LEN * 2 arrays) * sizeof(short)
+    size_t sharedMemSize = warps_per_block * MAX_READ_LEN * 2 * sizeof(short);
+
+    sw_extend_wavefront_kernel<<<numBlocks, blockSize, sharedMemSize>>>(
+        d_ref_packed, concat_ref.length() - 100, d_reads_packed, d_read_lengths, d_candidate_pos, d_candidate_read_idx,
+        total_candidates, d_out_scores, d_out_best_pos);
 
-    sw_extend_batched_kernel<<<numBlocks, blockSize>>>(d_ref_packed, concat_ref.length(), d_reads_packed,
-                                                       d_read_lengths, d_candidate_pos, d_candidate_read_idx,
-                                                       total_candidates, d_out_scores, d_out_best_pos);
     cudaDeviceSynchronize();
 
     std::vector<int> h_out_scores(total_candidates);
-    std::vector<int> h_out_best_pos(total_candidates);
+    std::vector<int64_t> h_out_best_pos(total_candidates);
     cudaMemcpy(h_out_scores.data(), d_out_scores, total_candidates * sizeof(int), cudaMemcpyDeviceToHost);
-    cudaMemcpy(h_out_best_pos.data(), d_out_best_pos, total_candidates * sizeof(int), cudaMemcpyDeviceToHost);
+    cudaMemcpy(h_out_best_pos.data(), d_out_best_pos, total_candidates * sizeof(int64_t), cudaMemcpyDeviceToHost);
 
     std::vector<int> best_score_per_read(num_reads, -1);
-    std::vector<int> best_pos_per_read(num_reads, -1);
+    std::vector<int64_t> best_pos_per_read(num_reads, -1);
 
     for (int i = 0; i < total_candidates; ++i) {
         int r_idx = h_candidate_read_idx[i];
@@ -398,10 +618,10 @@ int main() {
 
     for (int i = 0; i < num_reads; ++i) {
         if (best_score_per_read[i] > h_read_lengths[i] * 0.5) {
-            int global_pos = best_pos_per_read[i];
+            int64_t global_pos = best_pos_per_read[i];
             auto it = std::upper_bound(ref_offsets.begin(), ref_offsets.end(), global_pos);
             int ref_idx = std::distance(ref_offsets.begin(), it) - 1;
-            int local_pos = global_pos - ref_offsets[ref_idx];
+            int64_t local_pos = global_pos - ref_offsets[ref_idx];
 
             out_file << reads[i].name << " " << ref_names[ref_idx] << " " << local_pos << "\n";
         } else {
@@ -419,4 +639,4 @@ int main() {
     delete[] h_ref_packed;
 
     return 0;
-}
\ No newline at end of file
+}

From 5439885bbed77a43781cb0234ed9c0e2d81163ca Mon Sep 17 00:00:00 2001
From: yingziyu-llt <yingziyu-Lin@outlook.com>
Date: Sat, 14 Mar 2026 17:14:06 +0800
Subject: [PATCH 7/8] update all

---
 .../generate_benchmark.py"                    |  8 +--
 .../main.cu"                                  | 71 +------------------
 2 files changed, 6 insertions(+), 73 deletions(-)
 rename 05_seq_align/generate_benchmark.py => "05_seq_align/\346\236\227\344\271\220\345\244\251/generate_benchmark.py" (96%)
 rename 05_seq_align/main.cu => "05_seq_align/\346\236\227\344\271\220\345\244\251/main.cu" (92%)

diff --git a/05_seq_align/generate_benchmark.py "b/05_seq_align/\346\236\227\344\271\220\345\244\251/generate_benchmark.py"
similarity index 96%
rename from 05_seq_align/generate_benchmark.py
rename to "05_seq_align/\346\236\227\344\271\220\345\244\251/generate_benchmark.py"
index 47333e8..43a3687 100644
--- a/05_seq_align/generate_benchmark.py
+++ "b/05_seq_align/\346\236\227\344\271\220\345\244\251/generate_benchmark.py"
@@ -1,9 +1,9 @@
 import random
 import os
 
-NUM_CHROMOSOMES = 2
-CHROM_LENGTH = 50000
-NUM_READS = 1000
+NUM_CHROMOSOMES = 100
+CHROM_LENGTH = 100000000
+NUM_READS = 10000
 READ_LENGTH = 100
 MUTATION_RATE = 0.05
 UNKNOWN_RATE = 0.1
@@ -87,4 +87,4 @@ def generate_fastq_and_truth(chromosomes, fastq_file, truth_file):
     print("Generating reads and ground truth...")
     generate_fastq_and_truth(chroms, "reads.fastq", "truth.txt")
     
-    print("Benchmark data generated successfully.")
\ No newline at end of file
+    print("Benchmark data generated successfully.")
diff --git a/05_seq_align/main.cu "b/05_seq_align/\346\236\227\344\271\220\345\244\251/main.cu"
similarity index 92%
rename from 05_seq_align/main.cu
rename to "05_seq_align/\346\236\227\344\271\220\345\244\251/main.cu"
index 6e933f6..0327b9f 100644
--- a/05_seq_align/main.cu
+++ "b/05_seq_align/\346\236\227\344\271\220\345\244\251/main.cu"
@@ -13,52 +13,6 @@
 #include <sys/stat.h>
 #include <unistd.h>
 #include <vector>
-#ifdef USE_NVTX
-#include <nvToolsExt.h>
-#endif
-
-struct ScopedTimer {
-    const char *name;
-    std::chrono::high_resolution_clock::time_point t0;
-    ScopedTimer(const char *n) : name(n), t0(std::chrono::high_resolution_clock::now()) {
-#ifdef USE_NVTX
-        nvtxRangePushA(name);
-#endif
-    }
-    ~ScopedTimer() {
-        auto t1 = std::chrono::high_resolution_clock::now();
-        double ms = std::chrono::duration<double, std::milli>(t1 - t0).count();
-        std::fprintf(stderr, "[TIMER][CPU] %s: %.3f ms\n", name, ms);
-#ifdef USE_NVTX
-        nvtxRangePop();
-#endif
-    }
-};
-
-struct ScopedCudaEventTimer {
-    const char *name;
-    cudaEvent_t ev0{}, ev1{};
-    ScopedCudaEventTimer(const char *n) : name(n) {
-#ifdef USE_NVTX
-        nvtxRangePushA(name);
-#endif
-        cudaEventCreate(&ev0);
-        cudaEventCreate(&ev1);
-        cudaEventRecord(ev0);
-    }
-    ~ScopedCudaEventTimer() {
-        cudaEventRecord(ev1);
-        cudaEventSynchronize(ev1);
-        float ms = 0.0f;
-        cudaEventElapsedTime(&ms, ev0, ev1);
-        std::fprintf(stderr, "[TIMER][GPU] %s: %.3f ms\n", name, ms);
-        cudaEventDestroy(ev0);
-        cudaEventDestroy(ev1);
-#ifdef USE_NVTX
-        nvtxRangePop();
-#endif
-    }
-};
 
 #define MATCH_SCORE 2
 #define MISMATCH_SCORE -1
@@ -414,12 +368,7 @@ extern "C" __global__ void sw_extend_wavefront_kernel(const uint32_t *ref_packed
 }
 
 int main() {
-    ScopedTimer t_all("main_total");
-
-    {
-        ScopedTimer t("cuda_warmup");
         cudaFree(0);
-    }
 
     std::string ref_file = "reference.fasta";
     std::string reads_file = "reads.fastq";
@@ -429,42 +378,26 @@ int main() {
     std::vector<size_t> ref_offsets;
     std::string concat_ref;
 
-    {
-        ScopedTimer t("load_reference");
         load_reference(ref_file, ref_names, concat_ref, ref_offsets);
-    }
 
-    {
-        ScopedTimer t("append_ref_padding");
         concat_ref.append(100, 'N');
-    }
 
     std::vector<Read> reads;
-    {
-        ScopedTimer t("load_reads");
         load_reads(reads_file, reads);
-    }
 
     int num_reads = reads.size();
 
     size_t ref_encoded_size = (concat_ref.length() + 15) / 16;
     uint32_t *h_ref_packed = new uint32_t[ref_encoded_size];
 
-    {
-        ScopedTimer t("encode_reference_cpu");
         encode_sequence_cpu(concat_ref, h_ref_packed);
-    }
 
     uint32_t *d_ref_packed;
-    {
-        ScopedCudaEventTimer t("cuda_malloc_ref");
         cudaMalloc(&d_ref_packed, ref_encoded_size * sizeof(uint32_t));
-    }
+    
 
-    {
-        ScopedCudaEventTimer t("h2d_ref_copy");
         cudaMemcpy(d_ref_packed, h_ref_packed, ref_encoded_size * sizeof(uint32_t), cudaMemcpyHostToDevice);
-    }
+    
 
     std::vector<uint32_t> h_reads_packed(num_reads * (MAX_READ_LEN / 16), 0);
     std::vector<int> h_read_lengths(num_reads, 0);

From 5a77c7f9021773add7888be473d5ca44dd82a92f Mon Sep 17 00:00:00 2001
From: yingziyu-llt <yingziyu-Lin@outlook.com>
Date: Sat, 14 Mar 2026 17:22:32 +0800
Subject: [PATCH 8/8] finalize

---
 .../main.cu"                                  |  16 +-
 .../main.maca"                                | 572 +++++++++++++++++
 .../main.mu"                                  | 575 ++++++++++++++++++
 3 files changed, 1154 insertions(+), 9 deletions(-)
 create mode 100644 "05_seq_align/\346\236\227\344\271\220\345\244\251/main.maca"
 create mode 100644 "05_seq_align/\346\236\227\344\271\220\345\244\251/main.mu"

diff --git "a/05_seq_align/\346\236\227\344\271\220\345\244\251/main.cu" "b/05_seq_align/\346\236\227\344\271\220\345\244\251/main.cu"
index 0327b9f..2da1335 100644
--- "a/05_seq_align/\346\236\227\344\271\220\345\244\251/main.cu"
+++ "b/05_seq_align/\346\236\227\344\271\220\345\244\251/main.cu"
@@ -368,7 +368,7 @@ extern "C" __global__ void sw_extend_wavefront_kernel(const uint32_t *ref_packed
 }
 
 int main() {
-        cudaFree(0);
+    cudaFree(0);
 
     std::string ref_file = "reference.fasta";
     std::string reads_file = "reads.fastq";
@@ -378,26 +378,24 @@ int main() {
     std::vector<size_t> ref_offsets;
     std::string concat_ref;
 
-        load_reference(ref_file, ref_names, concat_ref, ref_offsets);
+    load_reference(ref_file, ref_names, concat_ref, ref_offsets);
 
-        concat_ref.append(100, 'N');
+    concat_ref.append(100, 'N');
 
     std::vector<Read> reads;
-        load_reads(reads_file, reads);
+    load_reads(reads_file, reads);
 
     int num_reads = reads.size();
 
     size_t ref_encoded_size = (concat_ref.length() + 15) / 16;
     uint32_t *h_ref_packed = new uint32_t[ref_encoded_size];
 
-        encode_sequence_cpu(concat_ref, h_ref_packed);
+    encode_sequence_cpu(concat_ref, h_ref_packed);
 
     uint32_t *d_ref_packed;
-        cudaMalloc(&d_ref_packed, ref_encoded_size * sizeof(uint32_t));
-    
+    cudaMalloc(&d_ref_packed, ref_encoded_size * sizeof(uint32_t));
 
-        cudaMemcpy(d_ref_packed, h_ref_packed, ref_encoded_size * sizeof(uint32_t), cudaMemcpyHostToDevice);
-    
+    cudaMemcpy(d_ref_packed, h_ref_packed, ref_encoded_size * sizeof(uint32_t), cudaMemcpyHostToDevice);
 
     std::vector<uint32_t> h_reads_packed(num_reads * (MAX_READ_LEN / 16), 0);
     std::vector<int> h_read_lengths(num_reads, 0);
diff --git "a/05_seq_align/\346\236\227\344\271\220\345\244\251/main.maca" "b/05_seq_align/\346\236\227\344\271\220\345\244\251/main.maca"
new file mode 100644
index 0000000..207b926
--- /dev/null
+++ "b/05_seq_align/\346\236\227\344\271\220\345\244\251/main.maca"
@@ -0,0 +1,572 @@
+#include <chrono>
+#include <cstdint>
+#include <cstdio>
+#include <cstring>
+#include <fcntl.h>
+#include <fstream>
+#include <iostream>
+#include <maca_runtime.h>
+#include <omp.h>
+#include <parallel/algorithm>
+#include <string>
+#include <sys/mman.h>
+#include <sys/stat.h>
+#include <unistd.h>
+#include <vector>
+
+#define MATCH_SCORE 2
+#define MISMATCH_SCORE -1
+#define GAP_PENALTY -1
+#define MAX_READ_LEN 256
+#define KMER_SIZE 16
+constexpr int LOOKUP_PREFIX_BITS = 22;
+constexpr uint32_t LOOKUP_BUCKETS = (1u << LOOKUP_PREFIX_BITS);
+constexpr int LOOKUP_SHIFT = 32 - LOOKUP_PREFIX_BITS;
+
+struct Read {
+    std::string name;
+    std::string seq;
+};
+
+// Modified to hold read kmers instead of reference kmers
+struct KmerPos {
+    uint32_t kmer;
+    int read_id;
+    int offset;
+    bool operator<(const KmerPos &other) const { return kmer < other.kmer; }
+};
+
+void load_reference(const std::string &filename, std::vector<std::string> &names, std::string &concat_seq,
+                    std::vector<size_t> &offsets) {
+    int fd = open(filename.c_str(), O_RDONLY);
+    if (fd == -1)
+        exit(1);
+
+    struct stat sb;
+    if (fstat(fd, &sb) == -1)
+        exit(1);
+    size_t size = sb.st_size;
+
+    const char *mapped_ptr = static_cast<const char *>(mmap(NULL, size, PROT_READ, MAP_PRIVATE, fd, 0));
+    if (mapped_ptr == MAP_FAILED)
+        exit(1);
+    close(fd);
+
+    concat_seq.reserve(size);
+
+    const char *ptr = mapped_ptr;
+    const char *end = ptr + size;
+    bool is_fastq = (ptr < end && *ptr == '@');
+
+    while (ptr < end) {
+        if (is_fastq) {
+            const char *name_end = (const char *)memchr(ptr, '\n', end - ptr);
+            if (!name_end)
+                name_end = end;
+            std::string name(ptr + 1, name_end - (ptr + 1));
+            if (!name.empty() && name.back() == '\r')
+                name.pop_back();
+            ptr = name_end + (name_end < end ? 1 : 0);
+
+            if (ptr >= end)
+                break;
+            const char *seq_end = (const char *)memchr(ptr, '\n', end - ptr);
+            if (!seq_end)
+                seq_end = end;
+            std::string seq(ptr, seq_end - ptr);
+            if (!seq.empty() && seq.back() == '\r')
+                seq.pop_back();
+            ptr = seq_end + (seq_end < end ? 1 : 0);
+
+            names.push_back(name);
+            offsets.push_back(concat_seq.size());
+            concat_seq += seq;
+
+            ptr = (const char *)memchr(ptr, '\n', end - ptr);
+            if (ptr)
+                ptr++;
+            else
+                break;
+            ptr = (const char *)memchr(ptr, '\n', end - ptr);
+            if (ptr)
+                ptr++;
+            else
+                break;
+        } else {
+            if (*ptr == '>') {
+                const char *name_end = (const char *)memchr(ptr, '\n', end - ptr);
+                if (!name_end)
+                    name_end = end;
+                std::string name(ptr + 1, name_end - (ptr + 1));
+                if (!name.empty() && name.back() == '\r')
+                    name.pop_back();
+                names.push_back(name);
+                offsets.push_back(concat_seq.size());
+                ptr = name_end + (name_end < end ? 1 : 0);
+            } else {
+                const char *seq_end = (const char *)memchr(ptr, '\n', end - ptr);
+                if (!seq_end)
+                    seq_end = end;
+                size_t seq_len = seq_end - ptr;
+                if (seq_len > 0 && *(seq_end - 1) == '\r')
+                    seq_len--;
+                concat_seq.append(ptr, seq_len);
+                ptr = seq_end + (seq_end < end ? 1 : 0);
+            }
+        }
+    }
+    munmap((void *)mapped_ptr, size);
+}
+
+void load_reads(const std::string &filename, std::vector<Read> &reads) {
+    int fd = open(filename.c_str(), O_RDONLY);
+    if (fd == -1)
+        exit(1);
+
+    struct stat sb;
+    if (fstat(fd, &sb) == -1)
+        exit(1);
+    size_t size = sb.st_size;
+
+    const char *mapped_ptr = static_cast<const char *>(mmap(NULL, size, PROT_READ, MAP_PRIVATE, fd, 0));
+    if (mapped_ptr == MAP_FAILED)
+        exit(1);
+    close(fd);
+
+    reads.reserve(size / 150);
+
+    const char *ptr = mapped_ptr;
+    const char *end = ptr + size;
+
+    while (ptr < end) {
+        if (*ptr != '@') {
+            ptr = (const char *)memchr(ptr, '\n', end - ptr);
+            if (ptr)
+                ptr++;
+            else
+                break;
+            continue;
+        }
+
+        Read r;
+        const char *name_end = (const char *)memchr(ptr, '\n', end - ptr);
+        if (!name_end)
+            name_end = end;
+        r.name.assign(ptr + 1, name_end - (ptr + 1));
+        if (!r.name.empty() && r.name.back() == '\r')
+            r.name.pop_back();
+        ptr = name_end + (name_end < end ? 1 : 0);
+
+        if (ptr >= end)
+            break;
+
+        const char *seq_end = (const char *)memchr(ptr, '\n', end - ptr);
+        if (!seq_end)
+            seq_end = end;
+        r.seq.assign(ptr, seq_end - ptr);
+        if (!r.seq.empty() && r.seq.back() == '\r')
+            r.seq.pop_back();
+        ptr = seq_end + (seq_end < end ? 1 : 0);
+
+        ptr = (const char *)memchr(ptr, '\n', end - ptr);
+        if (ptr)
+            ptr++;
+        else
+            break;
+        ptr = (const char *)memchr(ptr, '\n', end - ptr);
+        if (ptr)
+            ptr++;
+        else
+            break;
+
+        reads.push_back(std::move(r));
+    }
+    munmap((void *)mapped_ptr, size);
+}
+
+inline uint8_t char2val_cpu(char c) { return (c >> 1) & 0x03; }
+
+inline uint32_t pack_16mer(const char *seq) {
+    uint32_t packed = 0;
+    for (int i = 0; i < 16; ++i) {
+        uint8_t val = char2val_cpu(seq[i]);
+        packed |= (val << (30 - 2 * i));
+    }
+    return packed;
+}
+
+void encode_sequence_cpu(const std::string &seq, uint32_t *buffer) {
+    size_t length = seq.length();
+    size_t encoded_size = (length + 15) / 16;
+
+    // 小任务串行，避免 OMP 开销
+    if (encoded_size < 4096) {
+        for (size_t i = 0; i < encoded_size; ++i) {
+            uint32_t packed = 0;
+            size_t start = i * 16;
+            for (int j = 0; j < 16; ++j) {
+                if (start + j < length) {
+                    uint8_t val = char2val_cpu(seq[start + j]);
+                    packed |= (uint32_t)(val << (30 - 2 * j));
+                }
+            }
+            buffer[i] = packed;
+        }
+        return;
+    }
+
+#pragma omp parallel for
+    for (size_t i = 0; i < encoded_size; ++i) {
+        uint32_t packed = 0;
+        size_t start = i * 16;
+        for (int j = 0; j < 16; ++j) {
+            if (start + j < length) {
+                uint8_t val = char2val_cpu(seq[start + j]);
+                packed |= (uint32_t)(val << (30 - 2 * j));
+            }
+        }
+        buffer[i] = packed;
+    }
+}
+
+__device__ inline uint8_t get_base(const uint32_t *packed_array, size_t index) {
+    size_t array_idx = index / 16;
+    int bit_pos = 30 - 2 * (index % 16);
+    return (packed_array[array_idx] >> bit_pos) & 0x03;
+}
+
+extern "C" __global__ void sw_extend_wavefront_kernel(const uint32_t *ref_packed, size_t ref_len,
+                                                      const uint32_t *reads_packed, const int *read_lengths,
+                                                      const int64_t *candidate_positions,
+                                                      const int *candidate_read_indices, int total_candidates,
+                                                      int *out_scores, int64_t *out_best_pos) {
+    int global_warp_id = (blockIdx.x * blockDim.x + threadIdx.x) / 32;
+    int local_warp_id = threadIdx.x / 32;
+    int lane_id = threadIdx.x % 32;
+
+    if (global_warp_id >= total_candidates)
+        return;
+
+    extern __shared__ short shared_buf[];
+
+    int warps_per_block = blockDim.x / 32;
+    short *H_row = &shared_buf[local_warp_id * MAX_READ_LEN];
+    short *Start_row = &shared_buf[local_warp_id * MAX_READ_LEN + (MAX_READ_LEN * warps_per_block)];
+
+    int64_t ref_start = candidate_positions[global_warp_id];
+    int read_idx = candidate_read_indices[global_warp_id];
+    int read_len = read_lengths[read_idx];
+
+    const uint32_t *my_read_packed = reads_packed + read_idx * (MAX_READ_LEN / 16);
+
+    int ref_region_len = read_len + 100;
+    if (ref_start + ref_region_len > ref_len) {
+        ref_region_len = ref_len - ref_start;
+    }
+
+    int total_chunks = (read_len + 31) / 32;
+
+    for (int chunk = 0; chunk < total_chunks; ++chunk) {
+        int j = chunk * 32 + lane_id;
+        if (j < read_len) {
+            H_row[j] = 0;
+            Start_row[j] = -1;
+        }
+    }
+
+    __syncwarp();
+
+    short max_score = 0;
+    short best_ref_offset = 0;
+
+    short h_diag_reg[8] = {0};
+    short start_diag_reg[8] = {0};
+
+    short next_h_reg[8] = {0};
+    short next_start_reg[8] = {-1};
+
+    int total_steps = ref_region_len + read_len - 1;
+
+    for (int step = 0; step < total_steps; ++step) {
+
+        for (int chunk = 0; chunk < total_chunks; ++chunk) {
+            int j = chunk * 32 + lane_id;
+            int i = step - j;
+
+            if (i >= 0 && i < ref_region_len && j < read_len) {
+                int64_t current_ref_pos = ref_start + i;
+                uint8_t ref_base = get_base(ref_packed, current_ref_pos);
+                uint8_t read_base = get_base(my_read_packed, j);
+
+                short match = (ref_base == read_base) ? MATCH_SCORE : MISMATCH_SCORE;
+
+                short h_up = H_row[j];
+                short start_up = Start_row[j];
+
+                short h_left = (j > 0) ? H_row[j - 1] : 0;
+                short start_left = (j > 0) ? Start_row[j - 1] : -1;
+
+                short score_diag = h_diag_reg[chunk] + match;
+                short score_up = h_up + GAP_PENALTY;
+                short score_left = h_left + GAP_PENALTY;
+
+                short score = 0;
+                short start = i;
+
+                if (score_diag > 0 && score_diag >= score_up && score_diag >= score_left) {
+                    score = score_diag;
+                    start = (h_diag_reg[chunk] == 0) ? i : start_diag_reg[chunk];
+                } else if (score_up > 0 && score_up >= score_left) {
+                    score = score_up;
+                    start = (h_up == 0) ? i : start_up;
+                } else if (score_left > 0) {
+                    score = score_left;
+                    start = (h_left == 0) ? i : start_left;
+                }
+
+                // 【修复核心】将当前步的左侧值，作为下一步的对角线值缓存！
+                h_diag_reg[chunk] = h_left;
+                start_diag_reg[chunk] = start_left;
+
+                next_h_reg[chunk] = score;
+                next_start_reg[chunk] = start;
+
+                if (score > max_score) {
+                    max_score = score;
+                    best_ref_offset = start;
+                }
+            }
+        }
+
+        __syncwarp();
+
+        for (int chunk = 0; chunk < total_chunks; ++chunk) {
+            int j = chunk * 32 + lane_id;
+            int i = step - j;
+            if (i >= 0 && i < ref_region_len && j < read_len) {
+                H_row[j] = next_h_reg[chunk];
+                Start_row[j] = next_start_reg[chunk];
+            }
+        }
+
+        __syncwarp();
+    }
+
+    for (int offset = 16; offset > 0; offset /= 2) {
+        short shfl_score = __shfl_down_sync(0xffffffff, max_score, offset);
+        short shfl_offset = __shfl_down_sync(0xffffffff, best_ref_offset, offset);
+        if (shfl_score > max_score) {
+            max_score = shfl_score;
+            best_ref_offset = shfl_offset;
+        }
+    }
+
+    if (lane_id == 0) {
+        out_scores[global_warp_id] = max_score;
+        out_best_pos[global_warp_id] = ref_start + best_ref_offset;
+    }
+}
+
+int main() {
+    mcFree(0);
+
+    std::string ref_file = "reference.fasta";
+    std::string reads_file = "reads.fastq";
+    std::ofstream out_file("results.txt");
+
+    std::vector<std::string> ref_names;
+    std::vector<size_t> ref_offsets;
+    std::string concat_ref;
+
+    load_reference(ref_file, ref_names, concat_ref, ref_offsets);
+
+    concat_ref.append(100, 'N');
+
+    std::vector<Read> reads;
+    load_reads(reads_file, reads);
+
+    int num_reads = reads.size();
+
+    size_t ref_encoded_size = (concat_ref.length() + 15) / 16;
+    uint32_t *h_ref_packed = new uint32_t[ref_encoded_size];
+
+    encode_sequence_cpu(concat_ref, h_ref_packed);
+
+    uint32_t *d_ref_packed;
+    mcMalloc(&d_ref_packed, ref_encoded_size * sizeof(uint32_t));
+
+    mcMemcpy(d_ref_packed, h_ref_packed, ref_encoded_size * sizeof(uint32_t), mcMemcpyHostToDevice);
+
+    std::vector<uint32_t> h_reads_packed(num_reads * (MAX_READ_LEN / 16), 0);
+    std::vector<int> h_read_lengths(num_reads, 0);
+    std::vector<KmerPos> kmer_list;
+    kmer_list.reserve(num_reads * 50);
+
+    for (int i = 0; i < num_reads; ++i) {
+        int rlen = std::min((int)reads[i].seq.length(), MAX_READ_LEN);
+        h_read_lengths[i] = rlen;
+        encode_sequence_cpu(reads[i].seq, h_reads_packed.data() + i * (MAX_READ_LEN / 16));
+
+        for (int offset = 0; offset <= rlen - KMER_SIZE; offset += 2) {
+            kmer_list.push_back({pack_16mer(reads[i].seq.c_str() + offset), i, offset});
+        }
+    }
+    __gnu_parallel::sort(kmer_list.begin(), kmer_list.end());
+
+    // 22-bit prefix lookup
+    std::vector<uint32_t> lookup(LOOKUP_BUCKETS + 1, (uint32_t)kmer_list.size());
+    uint32_t prefix = 0;
+    lookup[0] = 0;
+    for (size_t i = 0; i < kmer_list.size(); ++i) {
+        uint32_t p = kmer_list[i].kmer >> LOOKUP_SHIFT;
+        while (prefix < p)
+            lookup[++prefix] = (uint32_t)i;
+    }
+    while (prefix < LOOKUP_BUCKETS)
+        lookup[++prefix] = (uint32_t)kmer_list.size();
+
+    std::vector<std::vector<int64_t>> read_cands(num_reads);
+
+    // rolling k-mer scan on reference
+    if (concat_ref.length() >= (KMER_SIZE + 100)) {
+        const int64_t max_pos = (int64_t)concat_ref.length() - KMER_SIZE - 100; // inclusive
+
+#pragma omp parallel
+        {
+            std::vector<std::vector<int64_t>> local_cands(num_reads);
+
+            const int tid = omp_get_thread_num();
+            const int nth = omp_get_num_threads();
+
+            const int64_t total = max_pos + 1; // pos in [0, max_pos]
+            const int64_t chunk = (total + nth - 1) / nth;
+            const int64_t begin = tid * chunk;
+            const int64_t end = std::min<int64_t>(total, begin + chunk);
+
+            if (begin < end) {
+                uint32_t kmer = pack_16mer(concat_ref.data() + begin);
+
+                for (int64_t pos = begin; pos < end; ++pos) {
+                    uint32_t p = kmer >> LOOKUP_SHIFT;
+
+                    for (uint32_t i = lookup[p]; i < lookup[p + 1]; ++i) {
+                        if (kmer_list[i].kmer == kmer) {
+                            int64_t start = pos - kmer_list[i].offset - 50;
+                            if (start >= 0)
+                                local_cands[kmer_list[i].read_id].push_back(start);
+                        }
+                    }
+
+                    // roll to next 16-mer
+                    if (pos + 1 < end) {
+                        kmer = (kmer << 2) | (uint32_t)char2val_cpu(concat_ref[pos + KMER_SIZE]);
+                    }
+                }
+            }
+
+#pragma omp critical
+            {
+                for (int i = 0; i < num_reads; ++i) {
+                    read_cands[i].insert(read_cands[i].end(), local_cands[i].begin(), local_cands[i].end());
+                }
+            }
+        }
+    }
+
+    std::vector<int64_t> h_candidate_pos;
+    std::vector<int> h_candidate_read_idx;
+
+    for (int i = 0; i < num_reads; ++i) {
+        if (read_cands[i].empty())
+            continue;
+        std::sort(read_cands[i].begin(), read_cands[i].end());
+        int64_t last_added = -1000;
+        int count = 0;
+        for (int64_t pos : read_cands[i]) {
+            if (pos > last_added + 50) {
+                h_candidate_pos.push_back(pos);
+                h_candidate_read_idx.push_back(i);
+                last_added = pos;
+                if (++count >= 100)
+                    break;
+            }
+        }
+    }
+
+    int total_candidates = h_candidate_pos.size();
+    if (total_candidates == 0) {
+        for (const auto &r : reads)
+            out_file << r.name << " unknown_origin\n";
+        return 0;
+    }
+
+    uint32_t *d_reads_packed;
+    int *d_read_lengths, *d_candidate_read_idx, *d_out_scores;
+    int64_t *d_candidate_pos, *d_out_best_pos;
+
+    mcMalloc(&d_reads_packed, h_reads_packed.size() * sizeof(uint32_t));
+    mcMalloc(&d_read_lengths, num_reads * sizeof(int));
+    mcMalloc(&d_candidate_pos, total_candidates * sizeof(int64_t));
+    mcMalloc(&d_candidate_read_idx, total_candidates * sizeof(int));
+    mcMalloc(&d_out_scores, total_candidates * sizeof(int));
+    mcMalloc(&d_out_best_pos, total_candidates * sizeof(int64_t));
+
+    mcMemcpy(d_reads_packed, h_reads_packed.data(), h_reads_packed.size() * sizeof(uint32_t), mcMemcpyHostToDevice);
+    mcMemcpy(d_read_lengths, h_read_lengths.data(), num_reads * sizeof(int), mcMemcpyHostToDevice);
+    mcMemcpy(d_candidate_pos, h_candidate_pos.data(), total_candidates * sizeof(int64_t), mcMemcpyHostToDevice);
+    mcMemcpy(d_candidate_read_idx, h_candidate_read_idx.data(), total_candidates * sizeof(int), mcMemcpyHostToDevice);
+
+    int blockSize = 128; // 4 Warps per Block
+    // Total threads needed = total_candidates * 32
+    int numBlocks = (total_candidates * 32 + blockSize - 1) / blockSize;
+
+    int warps_per_block = blockSize / 32;
+    // Shared memory: warps_per_block * (MAX_READ_LEN * 2 arrays) * sizeof(short)
+    size_t sharedMemSize = warps_per_block * MAX_READ_LEN * 2 * sizeof(short);
+
+    sw_extend_wavefront_kernel<<<numBlocks, blockSize, sharedMemSize>>>(
+        d_ref_packed, concat_ref.length() - 100, d_reads_packed, d_read_lengths, d_candidate_pos, d_candidate_read_idx,
+        total_candidates, d_out_scores, d_out_best_pos);
+
+    mcDeviceSynchronize();
+
+    std::vector<int> h_out_scores(total_candidates);
+    std::vector<int64_t> h_out_best_pos(total_candidates);
+    mcMemcpy(h_out_scores.data(), d_out_scores, total_candidates * sizeof(int), mcMemcpyDeviceToHost);
+    mcMemcpy(h_out_best_pos.data(), d_out_best_pos, total_candidates * sizeof(int64_t), mcMemcpyDeviceToHost);
+
+    std::vector<int> best_score_per_read(num_reads, -1);
+    std::vector<int64_t> best_pos_per_read(num_reads, -1);
+
+    for (int i = 0; i < total_candidates; ++i) {
+        int r_idx = h_candidate_read_idx[i];
+        if (h_out_scores[i] > best_score_per_read[r_idx]) {
+            best_score_per_read[r_idx] = h_out_scores[i];
+            best_pos_per_read[r_idx] = h_out_best_pos[i];
+        }
+    }
+
+    for (int i = 0; i < num_reads; ++i) {
+        if (best_score_per_read[i] > h_read_lengths[i] * 0.5) {
+            int64_t global_pos = best_pos_per_read[i];
+            auto it = std::upper_bound(ref_offsets.begin(), ref_offsets.end(), global_pos);
+            int ref_idx = std::distance(ref_offsets.begin(), it) - 1;
+            int64_t local_pos = global_pos - ref_offsets[ref_idx];
+
+            out_file << reads[i].name << " " << ref_names[ref_idx] << " " << local_pos << "\n";
+        } else {
+            out_file << reads[i].name << " unknown_origin\n";
+        }
+    }
+
+    mcFree(d_ref_packed);
+    mcFree(d_reads_packed);
+    mcFree(d_read_lengths);
+    mcFree(d_candidate_pos);
+    mcFree(d_candidate_read_idx);
+    mcFree(d_out_scores);
+    mcFree(d_out_best_pos);
+    delete[] h_ref_packed;
+
+    return 0;
+}
diff --git "a/05_seq_align/\346\236\227\344\271\220\345\244\251/main.mu" "b/05_seq_align/\346\236\227\344\271\220\345\244\251/main.mu"
new file mode 100644
index 0000000..81e0b22
--- /dev/null
+++ "b/05_seq_align/\346\236\227\344\271\220\345\244\251/main.mu"
@@ -0,0 +1,575 @@
+#include <chrono>
+#include <cstdint>
+#include <cstdio>
+#include <cstring>
+#include <musa_runtime.h>
+#include <fcntl.h>
+#include <fstream>
+#include <iostream>
+#include <omp.h>
+#include <parallel/algorithm>
+#include <string>
+#include <sys/mman.h>
+#include <sys/stat.h>
+#include <unistd.h>
+#include <vector>
+
+#define MATCH_SCORE 2
+#define MISMATCH_SCORE -1
+#define GAP_PENALTY -1
+#define MAX_READ_LEN 256
+#define KMER_SIZE 16
+constexpr int LOOKUP_PREFIX_BITS = 22;
+constexpr uint32_t LOOKUP_BUCKETS = (1u << LOOKUP_PREFIX_BITS);
+constexpr int LOOKUP_SHIFT = 32 - LOOKUP_PREFIX_BITS;
+
+struct Read {
+    std::string name;
+    std::string seq;
+};
+
+// Modified to hold read kmers instead of reference kmers
+struct KmerPos {
+    uint32_t kmer;
+    int read_id;
+    int offset;
+    bool operator<(const KmerPos &other) const { return kmer < other.kmer; }
+};
+
+void load_reference(const std::string &filename, std::vector<std::string> &names, std::string &concat_seq,
+                    std::vector<size_t> &offsets) {
+    int fd = open(filename.c_str(), O_RDONLY);
+    if (fd == -1)
+        exit(1);
+
+    struct stat sb;
+    if (fstat(fd, &sb) == -1)
+        exit(1);
+    size_t size = sb.st_size;
+
+    const char *mapped_ptr = static_cast<const char *>(mmap(NULL, size, PROT_READ, MAP_PRIVATE, fd, 0));
+    if (mapped_ptr == MAP_FAILED)
+        exit(1);
+    close(fd);
+
+    concat_seq.reserve(size);
+
+    const char *ptr = mapped_ptr;
+    const char *end = ptr + size;
+    bool is_fastq = (ptr < end && *ptr == '@');
+
+    while (ptr < end) {
+        if (is_fastq) {
+            const char *name_end = (const char *)memchr(ptr, '\n', end - ptr);
+            if (!name_end)
+                name_end = end;
+            std::string name(ptr + 1, name_end - (ptr + 1));
+            if (!name.empty() && name.back() == '\r')
+                name.pop_back();
+            ptr = name_end + (name_end < end ? 1 : 0);
+
+            if (ptr >= end)
+                break;
+            const char *seq_end = (const char *)memchr(ptr, '\n', end - ptr);
+            if (!seq_end)
+                seq_end = end;
+            std::string seq(ptr, seq_end - ptr);
+            if (!seq.empty() && seq.back() == '\r')
+                seq.pop_back();
+            ptr = seq_end + (seq_end < end ? 1 : 0);
+
+            names.push_back(name);
+            offsets.push_back(concat_seq.size());
+            concat_seq += seq;
+
+            ptr = (const char *)memchr(ptr, '\n', end - ptr);
+            if (ptr)
+                ptr++;
+            else
+                break;
+            ptr = (const char *)memchr(ptr, '\n', end - ptr);
+            if (ptr)
+                ptr++;
+            else
+                break;
+        } else {
+            if (*ptr == '>') {
+                const char *name_end = (const char *)memchr(ptr, '\n', end - ptr);
+                if (!name_end)
+                    name_end = end;
+                std::string name(ptr + 1, name_end - (ptr + 1));
+                if (!name.empty() && name.back() == '\r')
+                    name.pop_back();
+                names.push_back(name);
+                offsets.push_back(concat_seq.size());
+                ptr = name_end + (name_end < end ? 1 : 0);
+            } else {
+                const char *seq_end = (const char *)memchr(ptr, '\n', end - ptr);
+                if (!seq_end)
+                    seq_end = end;
+                size_t seq_len = seq_end - ptr;
+                if (seq_len > 0 && *(seq_end - 1) == '\r')
+                    seq_len--;
+                concat_seq.append(ptr, seq_len);
+                ptr = seq_end + (seq_end < end ? 1 : 0);
+            }
+        }
+    }
+    munmap((void *)mapped_ptr, size);
+}
+
+void load_reads(const std::string &filename, std::vector<Read> &reads) {
+    int fd = open(filename.c_str(), O_RDONLY);
+    if (fd == -1)
+        exit(1);
+
+    struct stat sb;
+    if (fstat(fd, &sb) == -1)
+        exit(1);
+    size_t size = sb.st_size;
+
+    const char *mapped_ptr = static_cast<const char *>(mmap(NULL, size, PROT_READ, MAP_PRIVATE, fd, 0));
+    if (mapped_ptr == MAP_FAILED)
+        exit(1);
+    close(fd);
+
+    reads.reserve(size / 150);
+
+    const char *ptr = mapped_ptr;
+    const char *end = ptr + size;
+
+    while (ptr < end) {
+        if (*ptr != '@') {
+            ptr = (const char *)memchr(ptr, '\n', end - ptr);
+            if (ptr)
+                ptr++;
+            else
+                break;
+            continue;
+        }
+
+        Read r;
+        const char *name_end = (const char *)memchr(ptr, '\n', end - ptr);
+        if (!name_end)
+            name_end = end;
+        r.name.assign(ptr + 1, name_end - (ptr + 1));
+        if (!r.name.empty() && r.name.back() == '\r')
+            r.name.pop_back();
+        ptr = name_end + (name_end < end ? 1 : 0);
+
+        if (ptr >= end)
+            break;
+
+        const char *seq_end = (const char *)memchr(ptr, '\n', end - ptr);
+        if (!seq_end)
+            seq_end = end;
+        r.seq.assign(ptr, seq_end - ptr);
+        if (!r.seq.empty() && r.seq.back() == '\r')
+            r.seq.pop_back();
+        ptr = seq_end + (seq_end < end ? 1 : 0);
+
+        ptr = (const char *)memchr(ptr, '\n', end - ptr);
+        if (ptr)
+            ptr++;
+        else
+            break;
+        ptr = (const char *)memchr(ptr, '\n', end - ptr);
+        if (ptr)
+            ptr++;
+        else
+            break;
+
+        reads.push_back(std::move(r));
+    }
+    munmap((void *)mapped_ptr, size);
+}
+
+inline uint8_t char2val_cpu(char c) { return (c >> 1) & 0x03; }
+
+inline uint32_t pack_16mer(const char *seq) {
+    uint32_t packed = 0;
+    for (int i = 0; i < 16; ++i) {
+        uint8_t val = char2val_cpu(seq[i]);
+        packed |= (val << (30 - 2 * i));
+    }
+    return packed;
+}
+
+void encode_sequence_cpu(const std::string &seq, uint32_t *buffer) {
+    size_t length = seq.length();
+    size_t encoded_size = (length + 15) / 16;
+
+    // 小任务串行，避免 OMP 开销
+    if (encoded_size < 4096) {
+        for (size_t i = 0; i < encoded_size; ++i) {
+            uint32_t packed = 0;
+            size_t start = i * 16;
+            for (int j = 0; j < 16; ++j) {
+                if (start + j < length) {
+                    uint8_t val = char2val_cpu(seq[start + j]);
+                    packed |= (uint32_t)(val << (30 - 2 * j));
+                }
+            }
+            buffer[i] = packed;
+        }
+        return;
+    }
+
+#pragma omp parallel for
+    for (size_t i = 0; i < encoded_size; ++i) {
+        uint32_t packed = 0;
+        size_t start = i * 16;
+        for (int j = 0; j < 16; ++j) {
+            if (start + j < length) {
+                uint8_t val = char2val_cpu(seq[start + j]);
+                packed |= (uint32_t)(val << (30 - 2 * j));
+            }
+        }
+        buffer[i] = packed;
+    }
+}
+
+__device__ inline uint8_t get_base(const uint32_t *packed_array, size_t index) {
+    size_t array_idx = index / 16;
+    int bit_pos = 30 - 2 * (index % 16);
+    return (packed_array[array_idx] >> bit_pos) & 0x03;
+}
+
+extern "C" __global__ void sw_extend_wavefront_kernel(const uint32_t *ref_packed, size_t ref_len,
+                                                      const uint32_t *reads_packed, const int *read_lengths,
+                                                      const int64_t *candidate_positions,
+                                                      const int *candidate_read_indices, int total_candidates,
+                                                      int *out_scores, int64_t *out_best_pos) {
+    int global_warp_id = (blockIdx.x * blockDim.x + threadIdx.x) / 32;
+    int local_warp_id = threadIdx.x / 32;
+    int lane_id = threadIdx.x % 32;
+
+    if (global_warp_id >= total_candidates)
+        return;
+
+    extern __shared__ short shared_buf[];
+
+    int warps_per_block = blockDim.x / 32;
+    short *H_row = &shared_buf[local_warp_id * MAX_READ_LEN];
+    short *Start_row = &shared_buf[local_warp_id * MAX_READ_LEN + (MAX_READ_LEN * warps_per_block)];
+
+    int64_t ref_start = candidate_positions[global_warp_id];
+    int read_idx = candidate_read_indices[global_warp_id];
+    int read_len = read_lengths[read_idx];
+
+    const uint32_t *my_read_packed = reads_packed + read_idx * (MAX_READ_LEN / 16);
+
+    int ref_region_len = read_len + 100;
+    if (ref_start + ref_region_len > ref_len) {
+        ref_region_len = ref_len - ref_start;
+    }
+
+    int total_chunks = (read_len + 31) / 32;
+
+    for (int chunk = 0; chunk < total_chunks; ++chunk) {
+        int j = chunk * 32 + lane_id;
+        if (j < read_len) {
+            H_row[j] = 0;
+            Start_row[j] = -1;
+        }
+    }
+
+    __syncwarp();
+
+    short max_score = 0;
+    short best_ref_offset = 0;
+
+    short h_diag_reg[8] = {0};
+    short start_diag_reg[8] = {0};
+
+    short next_h_reg[8] = {0};
+    short next_start_reg[8] = {-1};
+
+    int total_steps = ref_region_len + read_len - 1;
+
+    for (int step = 0; step < total_steps; ++step) {
+
+        for (int chunk = 0; chunk < total_chunks; ++chunk) {
+            int j = chunk * 32 + lane_id;
+            int i = step - j;
+
+            if (i >= 0 && i < ref_region_len && j < read_len) {
+                int64_t current_ref_pos = ref_start + i;
+                uint8_t ref_base = get_base(ref_packed, current_ref_pos);
+                uint8_t read_base = get_base(my_read_packed, j);
+
+                short match = (ref_base == read_base) ? MATCH_SCORE : MISMATCH_SCORE;
+
+                short h_up = H_row[j];
+                short start_up = Start_row[j];
+
+                short h_left = (j > 0) ? H_row[j - 1] : 0;
+                short start_left = (j > 0) ? Start_row[j - 1] : -1;
+
+                short score_diag = h_diag_reg[chunk] + match;
+                short score_up = h_up + GAP_PENALTY;
+                short score_left = h_left + GAP_PENALTY;
+
+                short score = 0;
+                short start = i;
+
+                if (score_diag > 0 && score_diag >= score_up && score_diag >= score_left) {
+                    score = score_diag;
+                    start = (h_diag_reg[chunk] == 0) ? i : start_diag_reg[chunk];
+                } else if (score_up > 0 && score_up >= score_left) {
+                    score = score_up;
+                    start = (h_up == 0) ? i : start_up;
+                } else if (score_left > 0) {
+                    score = score_left;
+                    start = (h_left == 0) ? i : start_left;
+                }
+
+                // 【修复核心】将当前步的左侧值，作为下一步的对角线值缓存！
+                h_diag_reg[chunk] = h_left;
+                start_diag_reg[chunk] = start_left;
+
+                next_h_reg[chunk] = score;
+                next_start_reg[chunk] = start;
+
+                if (score > max_score) {
+                    max_score = score;
+                    best_ref_offset = start;
+                }
+            }
+        }
+
+        __syncwarp();
+
+        for (int chunk = 0; chunk < total_chunks; ++chunk) {
+            int j = chunk * 32 + lane_id;
+            int i = step - j;
+            if (i >= 0 && i < ref_region_len && j < read_len) {
+                H_row[j] = next_h_reg[chunk];
+                Start_row[j] = next_start_reg[chunk];
+            }
+        }
+
+        __syncwarp();
+    }
+
+    for (int offset = 16; offset > 0; offset /= 2) {
+        short shfl_score = __shfl_down_sync(0xffffffff, max_score, offset);
+        short shfl_offset = __shfl_down_sync(0xffffffff, best_ref_offset, offset);
+        if (shfl_score > max_score) {
+            max_score = shfl_score;
+            best_ref_offset = shfl_offset;
+        }
+    }
+
+    if (lane_id == 0) {
+        out_scores[global_warp_id] = max_score;
+        out_best_pos[global_warp_id] = ref_start + best_ref_offset;
+    }
+}
+
+int main() {
+        musaFree(0);
+
+    std::string ref_file = "reference.fasta";
+    std::string reads_file = "reads.fastq";
+    std::ofstream out_file("results.txt");
+
+    std::vector<std::string> ref_names;
+    std::vector<size_t> ref_offsets;
+    std::string concat_ref;
+
+        load_reference(ref_file, ref_names, concat_ref, ref_offsets);
+
+        concat_ref.append(100, 'N');
+
+    std::vector<Read> reads;
+        load_reads(reads_file, reads);
+
+    int num_reads = reads.size();
+
+    size_t ref_encoded_size = (concat_ref.length() + 15) / 16;
+    uint32_t *h_ref_packed = new uint32_t[ref_encoded_size];
+
+        encode_sequence_cpu(concat_ref, h_ref_packed);
+
+    uint32_t *d_ref_packed;
+        musaMalloc(&d_ref_packed, ref_encoded_size * sizeof(uint32_t));
+    
+
+        musaMemcpy(d_ref_packed, h_ref_packed, ref_encoded_size * sizeof(uint32_t), musaMemcpyHostToDevice);
+    
+
+    std::vector<uint32_t> h_reads_packed(num_reads * (MAX_READ_LEN / 16), 0);
+    std::vector<int> h_read_lengths(num_reads, 0);
+    std::vector<KmerPos> kmer_list;
+    kmer_list.reserve(num_reads * 50);
+
+    for (int i = 0; i < num_reads; ++i) {
+        int rlen = std::min((int)reads[i].seq.length(), MAX_READ_LEN);
+        h_read_lengths[i] = rlen;
+        encode_sequence_cpu(reads[i].seq, h_reads_packed.data() + i * (MAX_READ_LEN / 16));
+
+        for (int offset = 0; offset <= rlen - KMER_SIZE; offset += 2) {
+            kmer_list.push_back({pack_16mer(reads[i].seq.c_str() + offset), i, offset});
+        }
+    }
+    __gnu_parallel::sort(kmer_list.begin(), kmer_list.end());
+
+    // 22-bit prefix lookup
+    std::vector<uint32_t> lookup(LOOKUP_BUCKETS + 1, (uint32_t)kmer_list.size());
+    uint32_t prefix = 0;
+    lookup[0] = 0;
+    for (size_t i = 0; i < kmer_list.size(); ++i) {
+        uint32_t p = kmer_list[i].kmer >> LOOKUP_SHIFT;
+        while (prefix < p)
+            lookup[++prefix] = (uint32_t)i;
+    }
+    while (prefix < LOOKUP_BUCKETS)
+        lookup[++prefix] = (uint32_t)kmer_list.size();
+
+    std::vector<std::vector<int64_t>> read_cands(num_reads);
+
+    // rolling k-mer scan on reference
+    if (concat_ref.length() >= (KMER_SIZE + 100)) {
+        const int64_t max_pos = (int64_t)concat_ref.length() - KMER_SIZE - 100; // inclusive
+
+#pragma omp parallel
+        {
+            std::vector<std::vector<int64_t>> local_cands(num_reads);
+
+            const int tid = omp_get_thread_num();
+            const int nth = omp_get_num_threads();
+
+            const int64_t total = max_pos + 1; // pos in [0, max_pos]
+            const int64_t chunk = (total + nth - 1) / nth;
+            const int64_t begin = tid * chunk;
+            const int64_t end = std::min<int64_t>(total, begin + chunk);
+
+            if (begin < end) {
+                uint32_t kmer = pack_16mer(concat_ref.data() + begin);
+
+                for (int64_t pos = begin; pos < end; ++pos) {
+                    uint32_t p = kmer >> LOOKUP_SHIFT;
+
+                    for (uint32_t i = lookup[p]; i < lookup[p + 1]; ++i) {
+                        if (kmer_list[i].kmer == kmer) {
+                            int64_t start = pos - kmer_list[i].offset - 50;
+                            if (start >= 0)
+                                local_cands[kmer_list[i].read_id].push_back(start);
+                        }
+                    }
+
+                    // roll to next 16-mer
+                    if (pos + 1 < end) {
+                        kmer = (kmer << 2) | (uint32_t)char2val_cpu(concat_ref[pos + KMER_SIZE]);
+                    }
+                }
+            }
+
+#pragma omp critical
+            {
+                for (int i = 0; i < num_reads; ++i) {
+                    read_cands[i].insert(read_cands[i].end(), local_cands[i].begin(), local_cands[i].end());
+                }
+            }
+        }
+    }
+
+    std::vector<int64_t> h_candidate_pos;
+    std::vector<int> h_candidate_read_idx;
+
+    for (int i = 0; i < num_reads; ++i) {
+        if (read_cands[i].empty())
+            continue;
+        std::sort(read_cands[i].begin(), read_cands[i].end());
+        int64_t last_added = -1000;
+        int count = 0;
+        for (int64_t pos : read_cands[i]) {
+            if (pos > last_added + 50) {
+                h_candidate_pos.push_back(pos);
+                h_candidate_read_idx.push_back(i);
+                last_added = pos;
+                if (++count >= 100)
+                    break;
+            }
+        }
+    }
+
+    int total_candidates = h_candidate_pos.size();
+    if (total_candidates == 0) {
+        for (const auto &r : reads)
+            out_file << r.name << " unknown_origin\n";
+        return 0;
+    }
+
+    uint32_t *d_reads_packed;
+    int *d_read_lengths, *d_candidate_read_idx, *d_out_scores;
+    int64_t *d_candidate_pos, *d_out_best_pos;
+
+    musaMalloc(&d_reads_packed, h_reads_packed.size() * sizeof(uint32_t));
+    musaMalloc(&d_read_lengths, num_reads * sizeof(int));
+    musaMalloc(&d_candidate_pos, total_candidates * sizeof(int64_t));
+    musaMalloc(&d_candidate_read_idx, total_candidates * sizeof(int));
+    musaMalloc(&d_out_scores, total_candidates * sizeof(int));
+    musaMalloc(&d_out_best_pos, total_candidates * sizeof(int64_t));
+
+    musaMemcpy(d_reads_packed, h_reads_packed.data(), h_reads_packed.size() * sizeof(uint32_t), musaMemcpyHostToDevice);
+    musaMemcpy(d_read_lengths, h_read_lengths.data(), num_reads * sizeof(int), musaMemcpyHostToDevice);
+    musaMemcpy(d_candidate_pos, h_candidate_pos.data(), total_candidates * sizeof(int64_t), musaMemcpyHostToDevice);
+    musaMemcpy(d_candidate_read_idx, h_candidate_read_idx.data(), total_candidates * sizeof(int),
+               musaMemcpyHostToDevice);
+
+    int blockSize = 128; // 4 Warps per Block
+    // Total threads needed = total_candidates * 32
+    int numBlocks = (total_candidates * 32 + blockSize - 1) / blockSize;
+
+    int warps_per_block = blockSize / 32;
+    // Shared memory: warps_per_block * (MAX_READ_LEN * 2 arrays) * sizeof(short)
+    size_t sharedMemSize = warps_per_block * MAX_READ_LEN * 2 * sizeof(short);
+
+    sw_extend_wavefront_kernel<<<numBlocks, blockSize, sharedMemSize>>>(
+        d_ref_packed, concat_ref.length() - 100, d_reads_packed, d_read_lengths, d_candidate_pos, d_candidate_read_idx,
+        total_candidates, d_out_scores, d_out_best_pos);
+
+    musaDeviceSynchronize();
+
+    std::vector<int> h_out_scores(total_candidates);
+    std::vector<int64_t> h_out_best_pos(total_candidates);
+    musaMemcpy(h_out_scores.data(), d_out_scores, total_candidates * sizeof(int), musaMemcpyDeviceToHost);
+    musaMemcpy(h_out_best_pos.data(), d_out_best_pos, total_candidates * sizeof(int64_t), musaMemcpyDeviceToHost);
+
+    std::vector<int> best_score_per_read(num_reads, -1);
+    std::vector<int64_t> best_pos_per_read(num_reads, -1);
+
+    for (int i = 0; i < total_candidates; ++i) {
+        int r_idx = h_candidate_read_idx[i];
+        if (h_out_scores[i] > best_score_per_read[r_idx]) {
+            best_score_per_read[r_idx] = h_out_scores[i];
+            best_pos_per_read[r_idx] = h_out_best_pos[i];
+        }
+    }
+
+    for (int i = 0; i < num_reads; ++i) {
+        if (best_score_per_read[i] > h_read_lengths[i] * 0.5) {
+            int64_t global_pos = best_pos_per_read[i];
+            auto it = std::upper_bound(ref_offsets.begin(), ref_offsets.end(), global_pos);
+            int ref_idx = std::distance(ref_offsets.begin(), it) - 1;
+            int64_t local_pos = global_pos - ref_offsets[ref_idx];
+
+            out_file << reads[i].name << " " << ref_names[ref_idx] << " " << local_pos << "\n";
+        } else {
+            out_file << reads[i].name << " unknown_origin\n";
+        }
+    }
+
+    musaFree(d_ref_packed);
+    musaFree(d_reads_packed);
+    musaFree(d_read_lengths);
+    musaFree(d_candidate_pos);
+    musaFree(d_candidate_read_idx);
+    musaFree(d_out_scores);
+    musaFree(d_out_best_pos);
+    delete[] h_ref_packed;
+
+    return 0;
+}