From eb25d3dfcbe942271643a677a77d4122227d0ed5 Mon Sep 17 00:00:00 2001 From: yingziyu-llt Date: Wed, 25 Feb 2026 22:54:26 +0800 Subject: [PATCH 1/8] update --- .gitignore | 4 + 05_seq_align/generate_benchmark.py | 90 ++++++++ 05_seq_align/main.cu | 342 +++++++++++++++++++++++++++++ 3 files changed, 436 insertions(+) create mode 100644 .gitignore create mode 100644 05_seq_align/generate_benchmark.py create mode 100644 05_seq_align/main.cu diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..d746c6d --- /dev/null +++ b/.gitignore @@ -0,0 +1,4 @@ +*.txt +*.fastq +*.fasta +main \ No newline at end of file diff --git a/05_seq_align/generate_benchmark.py b/05_seq_align/generate_benchmark.py new file mode 100644 index 0000000..47333e8 --- /dev/null +++ b/05_seq_align/generate_benchmark.py @@ -0,0 +1,90 @@ +import random +import os + +NUM_CHROMOSOMES = 2 +CHROM_LENGTH = 50000 +NUM_READS = 1000 +READ_LENGTH = 100 +MUTATION_RATE = 0.05 +UNKNOWN_RATE = 0.1 + +BASES = ['A', 'C', 'G', 'T'] +QUAL_HIGH = 'I' +QUAL_LOW = '!' + +def generate_fasta(filename): + chromosomes = {} + with open(filename, 'w') as f: + for i in range(1, NUM_CHROMOSOMES + 1): + chrom_name = f"chr{i}" + f.write(f">{chrom_name}\n") + + seq = [] + chunk_size = 10000 + for _ in range(0, CHROM_LENGTH, chunk_size): + chunk = ''.join(random.choices(BASES, k=min(chunk_size, CHROM_LENGTH - len(seq)))) + f.write(chunk + "\n") + seq.append(chunk) + + chromosomes[chrom_name] = ''.join(seq) + return chromosomes + +def mutate_sequence(seq): + mutated = list(seq) + quals = [QUAL_HIGH] * len(seq) + + # Track the actual start shift if deletions happen at the very beginning + actual_start_offset = 0 + + for i in range(len(mutated)): + if random.random() < MUTATION_RATE: + op = random.choice(['sub', 'ins', 'del']) + if op == 'sub': + mutated[i] = random.choice([b for b in BASES if b != mutated[i]]) + quals[i] = QUAL_LOW + elif op == 'ins': + mutated[i] = mutated[i] + random.choice(BASES) + quals[i] = QUAL_LOW + elif op == 'del': + mutated[i] = '' + quals[i] = '' + if i == 0: + actual_start_offset += 1 # If first base is deleted, the real start shifts + + return ''.join(mutated)[:READ_LENGTH], ''.join(quals)[:READ_LENGTH], actual_start_offset + +def generate_fastq_and_truth(chromosomes, fastq_file, truth_file): + chrom_names = list(chromosomes.keys()) + + with open(fastq_file, 'w') as fq, open(truth_file, 'w') as tr: + for i in range(1, NUM_READS + 1): + read_name = f"read{i}" + + if random.random() < UNKNOWN_RATE: + seq = ''.join(random.choices(BASES, k=READ_LENGTH)) + qual = QUAL_HIGH * READ_LENGTH + tr.write(f"{read_name} unknown_origin\n") + else: + chrom = random.choice(chrom_names) + start_pos = random.randint(0, CHROM_LENGTH - READ_LENGTH) + origin_seq = chromosomes[chrom][start_pos:start_pos + READ_LENGTH + 10] # Pad for deletions + + seq, qual, offset = mutate_sequence(origin_seq) + + while len(seq) < READ_LENGTH: + seq += random.choice(BASES) + qual += QUAL_HIGH + + final_pos = start_pos + offset + tr.write(f"{read_name} {chrom} {final_pos}\n") + + fq.write(f"@{read_name}\n{seq}\n+\n{qual}\n") + +if __name__ == "__main__": + print("Generating reference genome...") + chroms = generate_fasta("reference.fasta") + + print("Generating reads and ground truth...") + generate_fastq_and_truth(chroms, "reads.fastq", "truth.txt") + + print("Benchmark data generated successfully.") \ No newline at end of file diff --git a/05_seq_align/main.cu b/05_seq_align/main.cu new file mode 100644 index 0000000..fa875ca --- /dev/null +++ b/05_seq_align/main.cu @@ -0,0 +1,342 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define MATCH_SCORE 2 +#define MISMATCH_SCORE -1 +#define GAP_PENALTY -1 +#define MAX_READ_LEN 256 +#define KMER_SIZE 16 + +struct Read { + std::string name; + std::string seq; +}; + +void load_reference(const std::string& filename, std::vector& names, std::string& concat_seq, std::vector& offsets) { + std::ifstream file(filename); + if (!file.is_open()) exit(1); + std::string line, name, seq; + bool is_fastq = false; + + if (std::getline(file, line)) { + if (line[0] == '@') is_fastq = true; + name = line.substr(1); + } + + while (std::getline(file, line)) { + if (is_fastq) { + seq = line; + std::getline(file, line); + std::getline(file, line); + names.push_back(name); + offsets.push_back(concat_seq.size()); + concat_seq += seq; + if (std::getline(file, line)) name = line.substr(1); + } else { + if (line[0] == '>') { + names.push_back(name); + offsets.push_back(concat_seq.size()); + concat_seq += seq; + name = line.substr(1); + seq = ""; + } else { + seq += line; + } + } + } + if (!is_fastq && !name.empty()) { + names.push_back(name); + offsets.push_back(concat_seq.size()); + concat_seq += seq; + } +} + +void load_reads(const std::string& filename, std::vector& reads) { + std::ifstream file(filename); + if (!file.is_open()) exit(1); + std::string line; + while (std::getline(file, line)) { + if (line.empty()) continue; + Read r; + r.name = line.substr(1); + std::getline(file, r.seq); + std::getline(file, line); + std::getline(file, line); + reads.push_back(r); + } +} + +inline uint8_t char2val_cpu(char c) { + return (c >> 1) & 0x03; +} + +inline uint32_t pack_16mer(const char* seq) { + uint32_t packed = 0; + for (int i = 0; i < 16; ++i) { + uint8_t val = char2val_cpu(seq[i]); + packed |= (val << (30 - 2 * i)); + } + return packed; +} + +void encode_sequence_cpu(const std::string& seq, uint32_t* buffer) { + size_t length = seq.length(); + size_t encoded_size = (length + 15) / 16; + + #pragma omp parallel for + for (size_t i = 0; i < encoded_size; ++i) { + uint32_t packed = 0; + size_t start = i * 16; + for (int j = 0; j < 16; ++j) { + if (start + j < length) { + uint8_t val = char2val_cpu(seq[start + j]); + packed |= (val << (30 - 2 * j)); + } + } + buffer[i] = packed; + } +} + +__device__ inline uint8_t get_base(const uint32_t* packed_array, size_t index) { + size_t array_idx = index / 16; + int bit_pos = 30 - 2 * (index % 16); + return (packed_array[array_idx] >> bit_pos) & 0x03; +} + +__global__ void sw_extend_batched_kernel( + const uint32_t* ref_packed, size_t ref_len, + const uint32_t* reads_packed, const int* read_lengths, + const int* candidate_positions, const int* candidate_read_indices, + int total_candidates, + int* out_scores, int* out_best_pos) +{ + int tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid >= total_candidates) return; + + int ref_start = candidate_positions[tid]; + int read_idx = candidate_read_indices[tid]; + int read_len = read_lengths[read_idx]; + + const uint32_t* my_read_packed = reads_packed + read_idx * (MAX_READ_LEN / 16); + + // Give a larger window (100 bp extra) to accommodate high indel rates + int ref_region_len = read_len + 100; + if (ref_start + ref_region_len > ref_len) { + ref_region_len = ref_len - ref_start; + } + + int H_row[MAX_READ_LEN]; + int Start_row[MAX_READ_LEN]; + for (int i = 0; i < read_len; ++i) { + H_row[i] = 0; + Start_row[i] = -1; + } + + int max_score = 0; + int best_ref_start = ref_start; + + for (int i = 0; i < ref_region_len; ++i) { + int current_ref_pos = ref_start + i; + uint8_t ref_base = get_base(ref_packed, current_ref_pos); + + int h_diag = 0; + int start_diag = current_ref_pos; + int h_left = 0; + int start_left = -1; + + for (int j = 0; j < read_len; ++j) { + uint8_t read_base = get_base(my_read_packed, j); + int match = (ref_base == read_base) ? MATCH_SCORE : MISMATCH_SCORE; + + int score_diag = h_diag + match; + int score_up = H_row[j] + GAP_PENALTY; + int score_left = h_left + GAP_PENALTY; + + int score = 0; + int start = current_ref_pos; + + if (score_diag > 0 && score_diag >= score_up && score_diag >= score_left) { + score = score_diag; + // If previous score was 0, alignment restarts HERE, update start pos + start = (h_diag == 0) ? current_ref_pos : start_diag; + } else if (score_up > 0 && score_up >= score_left) { + score = score_up; + start = (H_row[j] == 0) ? current_ref_pos : Start_row[j]; + } else if (score_left > 0) { + score = score_left; + start = (h_left == 0) ? current_ref_pos : start_left; + } + + h_diag = H_row[j]; + start_diag = Start_row[j]; + + H_row[j] = score; + Start_row[j] = start; + + h_left = score; + start_left = start; + + if (score > max_score) { + max_score = score; + best_ref_start = start; + } + } + } + + out_scores[tid] = max_score; + out_best_pos[tid] = best_ref_start; +} + +int main() { + std::string ref_file = "reference.fasta"; + std::string reads_file = "reads.fastq"; + std::ofstream out_file("results.txt"); + + std::vector ref_names; + std::vector ref_offsets; + std::string concat_ref; + + load_reference(ref_file, ref_names, concat_ref, ref_offsets); + + std::vector reads; + load_reads(reads_file, reads); + + std::unordered_map> kmer_index; + // CRITICAL FIX: Stride = 1 guarantees we index EVERY possible 16-mer in the reference + for (size_t i = 0; i + KMER_SIZE <= concat_ref.length(); i += 1) { + uint32_t kmer = pack_16mer(concat_ref.c_str() + i); + if (kmer_index[kmer].size() < 100) { + kmer_index[kmer].push_back(i); + } + } + + size_t ref_encoded_size = (concat_ref.length() + 15) / 16; + uint32_t* h_ref_packed = new uint32_t[ref_encoded_size]; + encode_sequence_cpu(concat_ref, h_ref_packed); + + uint32_t* d_ref_packed; + cudaMalloc(&d_ref_packed, ref_encoded_size * sizeof(uint32_t)); + cudaMemcpy(d_ref_packed, h_ref_packed, ref_encoded_size * sizeof(uint32_t), cudaMemcpyHostToDevice); + + int num_reads = reads.size(); + std::vector h_reads_packed(num_reads * (MAX_READ_LEN / 16), 0); + std::vector h_read_lengths(num_reads, 0); + + std::vector h_candidate_pos; + std::vector h_candidate_read_idx; + + for (int i = 0; i < num_reads; ++i) { + int rlen = std::min((int)reads[i].seq.length(), MAX_READ_LEN); + h_read_lengths[i] = rlen; + + uint32_t* read_ptr = h_reads_packed.data() + i * (MAX_READ_LEN / 16); + encode_sequence_cpu(reads[i].seq, read_ptr); + + std::vector cands; + // CRITICAL FIX: Stride = 2 guarantees dense seed checking against the reference + for (int offset = 0; offset <= rlen - KMER_SIZE; offset += 2) { + uint32_t seed = pack_16mer(reads[i].seq.c_str() + offset); + auto it = kmer_index.find(seed); + if (it != kmer_index.end()) { + for (int pos : it->second) { + // Start looking slightly before the expected position + int ref_window_start = std::max(0, pos - offset - 50); + cands.push_back(ref_window_start); + } + } + } + + std::sort(cands.begin(), cands.end()); + int last_added = -1000; + for(int pos : cands) { + // De-duplicate overlapping windows + if (pos > last_added + 50) { + h_candidate_pos.push_back(pos); + h_candidate_read_idx.push_back(i); + last_added = pos; + } + } + } + + int total_candidates = h_candidate_pos.size(); + if (total_candidates == 0) { + for (const auto& r : reads) out_file << r.name << " unknown_origin\n"; + return 0; + } + + uint32_t* d_reads_packed; + int *d_read_lengths, *d_candidate_pos, *d_candidate_read_idx, *d_out_scores, *d_out_best_pos; + + cudaMalloc(&d_reads_packed, h_reads_packed.size() * sizeof(uint32_t)); + cudaMalloc(&d_read_lengths, num_reads * sizeof(int)); + cudaMalloc(&d_candidate_pos, total_candidates * sizeof(int)); + cudaMalloc(&d_candidate_read_idx, total_candidates * sizeof(int)); + cudaMalloc(&d_out_scores, total_candidates * sizeof(int)); + cudaMalloc(&d_out_best_pos, total_candidates * sizeof(int)); + + cudaMemcpy(d_reads_packed, h_reads_packed.data(), h_reads_packed.size() * sizeof(uint32_t), cudaMemcpyHostToDevice); + cudaMemcpy(d_read_lengths, h_read_lengths.data(), num_reads * sizeof(int), cudaMemcpyHostToDevice); + cudaMemcpy(d_candidate_pos, h_candidate_pos.data(), total_candidates * sizeof(int), cudaMemcpyHostToDevice); + cudaMemcpy(d_candidate_read_idx, h_candidate_read_idx.data(), total_candidates * sizeof(int), cudaMemcpyHostToDevice); + + int blockSize = 256; + int numBlocks = (total_candidates + blockSize - 1) / blockSize; + + sw_extend_batched_kernel<<>>( + d_ref_packed, concat_ref.length(), + d_reads_packed, d_read_lengths, + d_candidate_pos, d_candidate_read_idx, + total_candidates, + d_out_scores, d_out_best_pos + ); + cudaDeviceSynchronize(); + + std::vector h_out_scores(total_candidates); + std::vector h_out_best_pos(total_candidates); + cudaMemcpy(h_out_scores.data(), d_out_scores, total_candidates * sizeof(int), cudaMemcpyDeviceToHost); + cudaMemcpy(h_out_best_pos.data(), d_out_best_pos, total_candidates * sizeof(int), cudaMemcpyDeviceToHost); + + std::vector best_score_per_read(num_reads, -1); + std::vector best_pos_per_read(num_reads, -1); + + for (int i = 0; i < total_candidates; ++i) { + int r_idx = h_candidate_read_idx[i]; + if (h_out_scores[i] > best_score_per_read[r_idx]) { + best_score_per_read[r_idx] = h_out_scores[i]; + best_pos_per_read[r_idx] = h_out_best_pos[i]; + } + } + + for (int i = 0; i < num_reads; ++i) { + // Threshold: Need at least 50% length equivalent score (~25 perfect matches minimum) + if (best_score_per_read[i] > h_read_lengths[i] * 0.5) { + int global_pos = best_pos_per_read[i]; + auto it = std::upper_bound(ref_offsets.begin(), ref_offsets.end(), global_pos); + int ref_idx = std::distance(ref_offsets.begin(), it) - 1; + int local_pos = global_pos - ref_offsets[ref_idx]; + + // Output format strictly matches ground truth for clean diff + out_file << reads[i].name << " " << ref_names[ref_idx] << " " << local_pos << "\n"; + } else { + out_file << reads[i].name << " unknown_origin\n"; + } + } + + cudaFree(d_ref_packed); + cudaFree(d_reads_packed); + cudaFree(d_read_lengths); + cudaFree(d_candidate_pos); + cudaFree(d_candidate_read_idx); + cudaFree(d_out_scores); + cudaFree(d_out_best_pos); + delete[] h_ref_packed; + + return 0; +} \ No newline at end of file From 709eb38471dd308e9d155d4a300cab6c02fb0ae0 Mon Sep 17 00:00:00 2001 From: yingziyu-llt Date: Mon, 2 Mar 2026 23:11:19 +0800 Subject: [PATCH 2/8] update: remove unordered_map --- 05_seq_align/main.cu | 189 ++++++++++++++++++++++--------------------- 1 file changed, 98 insertions(+), 91 deletions(-) diff --git a/05_seq_align/main.cu b/05_seq_align/main.cu index fa875ca..d9578dd 100644 --- a/05_seq_align/main.cu +++ b/05_seq_align/main.cu @@ -1,12 +1,11 @@ -#include -#include -#include +#include #include -#include -#include -#include -#include #include +#include +#include +#include +#include +#include #define MATCH_SCORE 2 #define MISMATCH_SCORE -1 @@ -19,26 +18,39 @@ struct Read { std::string seq; }; -void load_reference(const std::string& filename, std::vector& names, std::string& concat_seq, std::vector& offsets) { +struct KmerPos { + uint32_t kmer; + int pos; + bool operator<(const KmerPos& other) const { + if (kmer != other.kmer) return kmer < other.kmer; + return pos < other.pos; + } +}; + +void load_reference(const std::string &filename, std::vector &names, std::string &concat_seq, + std::vector &offsets) { std::ifstream file(filename); - if (!file.is_open()) exit(1); + if (!file.is_open()) + exit(1); std::string line, name, seq; bool is_fastq = false; if (std::getline(file, line)) { - if (line[0] == '@') is_fastq = true; + if (line[0] == '@') + is_fastq = true; name = line.substr(1); } while (std::getline(file, line)) { if (is_fastq) { seq = line; - std::getline(file, line); - std::getline(file, line); + std::getline(file, line); + std::getline(file, line); names.push_back(name); offsets.push_back(concat_seq.size()); concat_seq += seq; - if (std::getline(file, line)) name = line.substr(1); + if (std::getline(file, line)) + name = line.substr(1); } else { if (line[0] == '>') { names.push_back(name); @@ -58,26 +70,26 @@ void load_reference(const std::string& filename, std::vector& names } } -void load_reads(const std::string& filename, std::vector& reads) { +void load_reads(const std::string &filename, std::vector &reads) { std::ifstream file(filename); - if (!file.is_open()) exit(1); + if (!file.is_open()) + exit(1); std::string line; while (std::getline(file, line)) { - if (line.empty()) continue; + if (line.empty()) + continue; Read r; r.name = line.substr(1); std::getline(file, r.seq); - std::getline(file, line); - std::getline(file, line); + std::getline(file, line); + std::getline(file, line); reads.push_back(r); } } -inline uint8_t char2val_cpu(char c) { - return (c >> 1) & 0x03; -} +inline uint8_t char2val_cpu(char c) { return (c >> 1) & 0x03; } -inline uint32_t pack_16mer(const char* seq) { +inline uint32_t pack_16mer(const char *seq) { uint32_t packed = 0; for (int i = 0; i < 16; ++i) { uint8_t val = char2val_cpu(seq[i]); @@ -86,48 +98,44 @@ inline uint32_t pack_16mer(const char* seq) { return packed; } -void encode_sequence_cpu(const std::string& seq, uint32_t* buffer) { +void encode_sequence_cpu(const std::string &seq, uint32_t *buffer) { size_t length = seq.length(); size_t encoded_size = (length + 15) / 16; - - #pragma omp parallel for +#pragma omp parallel for for (size_t i = 0; i < encoded_size; ++i) { uint32_t packed = 0; size_t start = i * 16; for (int j = 0; j < 16; ++j) { if (start + j < length) { uint8_t val = char2val_cpu(seq[start + j]); - packed |= (val << (30 - 2 * j)); + packed |= (uint32_t)(val << (30 - 2 * j)); } } buffer[i] = packed; } } -__device__ inline uint8_t get_base(const uint32_t* packed_array, size_t index) { +__device__ inline uint8_t get_base(const uint32_t *packed_array, size_t index) { size_t array_idx = index / 16; int bit_pos = 30 - 2 * (index % 16); return (packed_array[array_idx] >> bit_pos) & 0x03; } -__global__ void sw_extend_batched_kernel( - const uint32_t* ref_packed, size_t ref_len, - const uint32_t* reads_packed, const int* read_lengths, - const int* candidate_positions, const int* candidate_read_indices, - int total_candidates, - int* out_scores, int* out_best_pos) -{ +__global__ void sw_extend_batched_kernel(const uint32_t *ref_packed, size_t ref_len, const uint32_t *reads_packed, + const int *read_lengths, const int *candidate_positions, + const int *candidate_read_indices, int total_candidates, int *out_scores, + int *out_best_pos) { int tid = blockIdx.x * blockDim.x + threadIdx.x; - if (tid >= total_candidates) return; + if (tid >= total_candidates) + return; int ref_start = candidate_positions[tid]; int read_idx = candidate_read_indices[tid]; int read_len = read_lengths[read_idx]; - - const uint32_t* my_read_packed = reads_packed + read_idx * (MAX_READ_LEN / 16); - // Give a larger window (100 bp extra) to accommodate high indel rates - int ref_region_len = read_len + 100; + const uint32_t *my_read_packed = reads_packed + read_idx * (MAX_READ_LEN / 16); + + int ref_region_len = read_len + 100; if (ref_start + ref_region_len > ref_len) { ref_region_len = ref_len - ref_start; } @@ -145,27 +153,26 @@ __global__ void sw_extend_batched_kernel( for (int i = 0; i < ref_region_len; ++i) { int current_ref_pos = ref_start + i; uint8_t ref_base = get_base(ref_packed, current_ref_pos); - - int h_diag = 0; - int start_diag = current_ref_pos; + + int h_diag = 0; + int start_diag = current_ref_pos; int h_left = 0; int start_left = -1; for (int j = 0; j < read_len; ++j) { uint8_t read_base = get_base(my_read_packed, j); int match = (ref_base == read_base) ? MATCH_SCORE : MISMATCH_SCORE; - + int score_diag = h_diag + match; int score_up = H_row[j] + GAP_PENALTY; int score_left = h_left + GAP_PENALTY; - + int score = 0; int start = current_ref_pos; - + if (score_diag > 0 && score_diag >= score_up && score_diag >= score_left) { score = score_diag; - // If previous score was 0, alignment restarts HERE, update start pos - start = (h_diag == 0) ? current_ref_pos : start_diag; + start = (h_diag == 0) ? current_ref_pos : start_diag; } else if (score_up > 0 && score_up >= score_left) { score = score_up; start = (H_row[j] == 0) ? current_ref_pos : Start_row[j]; @@ -176,11 +183,11 @@ __global__ void sw_extend_batched_kernel( h_diag = H_row[j]; start_diag = Start_row[j]; - - H_row[j] = score; + + H_row[j] = score; Start_row[j] = start; - - h_left = score; + + h_left = score; start_left = start; if (score > max_score) { @@ -191,7 +198,7 @@ __global__ void sw_extend_batched_kernel( } out_scores[tid] = max_score; - out_best_pos[tid] = best_ref_start; + out_best_pos[tid] = best_ref_start; } int main() { @@ -204,60 +211,64 @@ int main() { std::string concat_ref; load_reference(ref_file, ref_names, concat_ref, ref_offsets); - + std::vector reads; load_reads(reads_file, reads); - std::unordered_map> kmer_index; - // CRITICAL FIX: Stride = 1 guarantees we index EVERY possible 16-mer in the reference - for (size_t i = 0; i + KMER_SIZE <= concat_ref.length(); i += 1) { - uint32_t kmer = pack_16mer(concat_ref.c_str() + i); - if (kmer_index[kmer].size() < 100) { - kmer_index[kmer].push_back(i); + // Build kmer index with array and sort + std::vector kmer_list; + if (concat_ref.length() >= KMER_SIZE) { + kmer_list.reserve(concat_ref.length() - KMER_SIZE + 1); + for (size_t i = 0; i + KMER_SIZE <= concat_ref.length(); i += 1) { + kmer_list.push_back({pack_16mer(concat_ref.c_str() + i), (int)i}); } + std::sort(kmer_list.begin(), kmer_list.end()); } size_t ref_encoded_size = (concat_ref.length() + 15) / 16; - uint32_t* h_ref_packed = new uint32_t[ref_encoded_size]; + uint32_t *h_ref_packed = new uint32_t[ref_encoded_size]; encode_sequence_cpu(concat_ref, h_ref_packed); - uint32_t* d_ref_packed; + uint32_t *d_ref_packed; cudaMalloc(&d_ref_packed, ref_encoded_size * sizeof(uint32_t)); cudaMemcpy(d_ref_packed, h_ref_packed, ref_encoded_size * sizeof(uint32_t), cudaMemcpyHostToDevice); int num_reads = reads.size(); std::vector h_reads_packed(num_reads * (MAX_READ_LEN / 16), 0); std::vector h_read_lengths(num_reads, 0); - + std::vector h_candidate_pos; std::vector h_candidate_read_idx; for (int i = 0; i < num_reads; ++i) { int rlen = std::min((int)reads[i].seq.length(), MAX_READ_LEN); h_read_lengths[i] = rlen; - - uint32_t* read_ptr = h_reads_packed.data() + i * (MAX_READ_LEN / 16); + + uint32_t *read_ptr = h_reads_packed.data() + i * (MAX_READ_LEN / 16); encode_sequence_cpu(reads[i].seq, read_ptr); std::vector cands; - // CRITICAL FIX: Stride = 2 guarantees dense seed checking against the reference + + // Find seeds using binary search for (int offset = 0; offset <= rlen - KMER_SIZE; offset += 2) { uint32_t seed = pack_16mer(reads[i].seq.c_str() + offset); - auto it = kmer_index.find(seed); - if (it != kmer_index.end()) { - for (int pos : it->second) { - // Start looking slightly before the expected position - int ref_window_start = std::max(0, pos - offset - 50); - cands.push_back(ref_window_start); - } + KmerPos target = {seed, 0}; + auto it = std::lower_bound(kmer_list.begin(), kmer_list.end(), target); + + int count = 0; + // Limit to max 100 hits per kmer to mimic previous behavior + while (it != kmer_list.end() && it->kmer == seed && count < 100) { + int ref_window_start = std::max(0, it->pos - offset - 50); + cands.push_back(ref_window_start); + ++it; + ++count; } } - + std::sort(cands.begin(), cands.end()); int last_added = -1000; - for(int pos : cands) { - // De-duplicate overlapping windows - if (pos > last_added + 50) { + for (int pos : cands) { + if (pos > last_added + 50) { h_candidate_pos.push_back(pos); h_candidate_read_idx.push_back(i); last_added = pos; @@ -267,11 +278,12 @@ int main() { int total_candidates = h_candidate_pos.size(); if (total_candidates == 0) { - for (const auto& r : reads) out_file << r.name << " unknown_origin\n"; + for (const auto &r : reads) + out_file << r.name << " unknown_origin\n"; return 0; } - uint32_t* d_reads_packed; + uint32_t *d_reads_packed; int *d_read_lengths, *d_candidate_pos, *d_candidate_read_idx, *d_out_scores, *d_out_best_pos; cudaMalloc(&d_reads_packed, h_reads_packed.size() * sizeof(uint32_t)); @@ -284,18 +296,15 @@ int main() { cudaMemcpy(d_reads_packed, h_reads_packed.data(), h_reads_packed.size() * sizeof(uint32_t), cudaMemcpyHostToDevice); cudaMemcpy(d_read_lengths, h_read_lengths.data(), num_reads * sizeof(int), cudaMemcpyHostToDevice); cudaMemcpy(d_candidate_pos, h_candidate_pos.data(), total_candidates * sizeof(int), cudaMemcpyHostToDevice); - cudaMemcpy(d_candidate_read_idx, h_candidate_read_idx.data(), total_candidates * sizeof(int), cudaMemcpyHostToDevice); + cudaMemcpy(d_candidate_read_idx, h_candidate_read_idx.data(), total_candidates * sizeof(int), + cudaMemcpyHostToDevice); int blockSize = 256; int numBlocks = (total_candidates + blockSize - 1) / blockSize; - - sw_extend_batched_kernel<<>>( - d_ref_packed, concat_ref.length(), - d_reads_packed, d_read_lengths, - d_candidate_pos, d_candidate_read_idx, - total_candidates, - d_out_scores, d_out_best_pos - ); + + sw_extend_batched_kernel<<>>(d_ref_packed, concat_ref.length(), d_reads_packed, + d_read_lengths, d_candidate_pos, d_candidate_read_idx, + total_candidates, d_out_scores, d_out_best_pos); cudaDeviceSynchronize(); std::vector h_out_scores(total_candidates); @@ -315,14 +324,12 @@ int main() { } for (int i = 0; i < num_reads; ++i) { - // Threshold: Need at least 50% length equivalent score (~25 perfect matches minimum) - if (best_score_per_read[i] > h_read_lengths[i] * 0.5) { + if (best_score_per_read[i] > h_read_lengths[i] * 0.5) { int global_pos = best_pos_per_read[i]; auto it = std::upper_bound(ref_offsets.begin(), ref_offsets.end(), global_pos); int ref_idx = std::distance(ref_offsets.begin(), it) - 1; int local_pos = global_pos - ref_offsets[ref_idx]; - - // Output format strictly matches ground truth for clean diff + out_file << reads[i].name << " " << ref_names[ref_idx] << " " << local_pos << "\n"; } else { out_file << reads[i].name << " unknown_origin\n"; From e5e39e7b68093df942fd441d5f3de5c2243ac627 Mon Sep 17 00:00:00 2001 From: yingziyu-llt Date: Mon, 2 Mar 2026 23:12:07 +0800 Subject: [PATCH 3/8] update: better parallel --- 05_seq_align/main.cu | 74 +++++++++++++++++++++++++------------------- 1 file changed, 43 insertions(+), 31 deletions(-) diff --git a/05_seq_align/main.cu b/05_seq_align/main.cu index d9578dd..4f3a7ae 100644 --- a/05_seq_align/main.cu +++ b/05_seq_align/main.cu @@ -222,7 +222,7 @@ int main() { for (size_t i = 0; i + KMER_SIZE <= concat_ref.length(); i += 1) { kmer_list.push_back({pack_16mer(concat_ref.c_str() + i), (int)i}); } - std::sort(kmer_list.begin(), kmer_list.end()); + __gnu_parallel::sort(kmer_list.begin(), kmer_list.end()); } size_t ref_encoded_size = (concat_ref.length() + 15) / 16; @@ -240,40 +240,52 @@ int main() { std::vector h_candidate_pos; std::vector h_candidate_read_idx; - for (int i = 0; i < num_reads; ++i) { - int rlen = std::min((int)reads[i].seq.length(), MAX_READ_LEN); - h_read_lengths[i] = rlen; - - uint32_t *read_ptr = h_reads_packed.data() + i * (MAX_READ_LEN / 16); - encode_sequence_cpu(reads[i].seq, read_ptr); - - std::vector cands; - - // Find seeds using binary search - for (int offset = 0; offset <= rlen - KMER_SIZE; offset += 2) { - uint32_t seed = pack_16mer(reads[i].seq.c_str() + offset); - KmerPos target = {seed, 0}; - auto it = std::lower_bound(kmer_list.begin(), kmer_list.end(), target); - - int count = 0; - // Limit to max 100 hits per kmer to mimic previous behavior - while (it != kmer_list.end() && it->kmer == seed && count < 100) { - int ref_window_start = std::max(0, it->pos - offset - 50); - cands.push_back(ref_window_start); - ++it; - ++count; + // Parallelize seed matching + #pragma omp parallel + { + std::vector local_cands_pos; + std::vector local_cands_idx; + + #pragma omp for schedule(dynamic, 100) + for (int i = 0; i < num_reads; ++i) { + int rlen = std::min((int)reads[i].seq.length(), MAX_READ_LEN); + h_read_lengths[i] = rlen; + + uint32_t *read_ptr = h_reads_packed.data() + i * (MAX_READ_LEN / 16); + encode_sequence_cpu(reads[i].seq, read_ptr); + + std::vector cands; + for (int offset = 0; offset <= rlen - KMER_SIZE; offset += 2) { + uint32_t seed = pack_16mer(reads[i].seq.c_str() + offset); + KmerPos target = {seed, 0}; + auto it = std::lower_bound(kmer_list.begin(), kmer_list.end(), target); + + int count = 0; + while (it != kmer_list.end() && it->kmer == seed && count < 100) { + int ref_window_start = std::max(0, it->pos - offset - 50); + cands.push_back(ref_window_start); + ++it; + ++count; + } } - } - std::sort(cands.begin(), cands.end()); - int last_added = -1000; - for (int pos : cands) { - if (pos > last_added + 50) { - h_candidate_pos.push_back(pos); - h_candidate_read_idx.push_back(i); - last_added = pos; + std::sort(cands.begin(), cands.end()); + int last_added = -1000; + for (int pos : cands) { + if (pos > last_added + 50) { + local_cands_pos.push_back(pos); + local_cands_idx.push_back(i); + last_added = pos; + } } } + + // Merge thread-local results + #pragma omp critical + { + h_candidate_pos.insert(h_candidate_pos.end(), local_cands_pos.begin(), local_cands_pos.end()); + h_candidate_read_idx.insert(h_candidate_read_idx.end(), local_cands_idx.begin(), local_cands_idx.end()); + } } int total_candidates = h_candidate_pos.size(); From 5b6380f4fb1ddcdcbd89ea836f0528982acbc95e Mon Sep 17 00:00:00 2001 From: yingziyu-llt Date: Mon, 2 Mar 2026 23:22:08 +0800 Subject: [PATCH 4/8] update: better io --- 05_seq_align/main.cu | 138 +++++++++++++++++++++++++++++++------------ 1 file changed, 99 insertions(+), 39 deletions(-) diff --git a/05_seq_align/main.cu b/05_seq_align/main.cu index 4f3a7ae..960bce2 100644 --- a/05_seq_align/main.cu +++ b/05_seq_align/main.cu @@ -6,6 +6,12 @@ #include #include #include +#include // For memchr +#include +#include +#include +#include + #define MATCH_SCORE 2 #define MISMATCH_SCORE -1 @@ -29,64 +35,118 @@ struct KmerPos { void load_reference(const std::string &filename, std::vector &names, std::string &concat_seq, std::vector &offsets) { - std::ifstream file(filename); - if (!file.is_open()) - exit(1); - std::string line, name, seq; - bool is_fastq = false; - - if (std::getline(file, line)) { - if (line[0] == '@') - is_fastq = true; - name = line.substr(1); - } - - while (std::getline(file, line)) { + int fd = open(filename.c_str(), O_RDONLY); + if (fd == -1) exit(1); + + struct stat sb; + if (fstat(fd, &sb) == -1) exit(1); + size_t size = sb.st_size; + + const char* mapped_ptr = static_cast(mmap(NULL, size, PROT_READ, MAP_PRIVATE, fd, 0)); + if (mapped_ptr == MAP_FAILED) exit(1); + close(fd); + + concat_seq.reserve(size); + + const char* ptr = mapped_ptr; + const char* end = ptr + size; + bool is_fastq = (ptr < end && *ptr == '@'); + + while (ptr < end) { if (is_fastq) { - seq = line; - std::getline(file, line); - std::getline(file, line); + const char* name_end = (const char*)memchr(ptr, '\n', end - ptr); + if (!name_end) name_end = end; + std::string name(ptr + 1, name_end - (ptr + 1)); + if (!name.empty() && name.back() == '\r') name.pop_back(); + ptr = name_end + (name_end < end ? 1 : 0); + + if (ptr >= end) break; + const char* seq_end = (const char*)memchr(ptr, '\n', end - ptr); + if (!seq_end) seq_end = end; + std::string seq(ptr, seq_end - ptr); + if (!seq.empty() && seq.back() == '\r') seq.pop_back(); + ptr = seq_end + (seq_end < end ? 1 : 0); + names.push_back(name); offsets.push_back(concat_seq.size()); concat_seq += seq; - if (std::getline(file, line)) - name = line.substr(1); + + ptr = (const char*)memchr(ptr, '\n', end - ptr); + if (ptr) ptr++; else break; + ptr = (const char*)memchr(ptr, '\n', end - ptr); + if (ptr) ptr++; else break; } else { - if (line[0] == '>') { + if (*ptr == '>') { + const char* name_end = (const char*)memchr(ptr, '\n', end - ptr); + if (!name_end) name_end = end; + std::string name(ptr + 1, name_end - (ptr + 1)); + if (!name.empty() && name.back() == '\r') name.pop_back(); names.push_back(name); offsets.push_back(concat_seq.size()); - concat_seq += seq; - name = line.substr(1); - seq = ""; + ptr = name_end + (name_end < end ? 1 : 0); } else { - seq += line; + const char* seq_end = (const char*)memchr(ptr, '\n', end - ptr); + if (!seq_end) seq_end = end; + size_t seq_len = seq_end - ptr; + if (seq_len > 0 && *(seq_end - 1) == '\r') seq_len--; + concat_seq.append(ptr, seq_len); + ptr = seq_end + (seq_end < end ? 1 : 0); } } } - if (!is_fastq && !name.empty()) { - names.push_back(name); - offsets.push_back(concat_seq.size()); - concat_seq += seq; - } + munmap((void*)mapped_ptr, size); } void load_reads(const std::string &filename, std::vector &reads) { - std::ifstream file(filename); - if (!file.is_open()) - exit(1); - std::string line; - while (std::getline(file, line)) { - if (line.empty()) + int fd = open(filename.c_str(), O_RDONLY); + if (fd == -1) exit(1); + + struct stat sb; + if (fstat(fd, &sb) == -1) exit(1); + size_t size = sb.st_size; + + const char* mapped_ptr = static_cast(mmap(NULL, size, PROT_READ, MAP_PRIVATE, fd, 0)); + if (mapped_ptr == MAP_FAILED) exit(1); + close(fd); + + reads.reserve(size / 150); + + const char* ptr = mapped_ptr; + const char* end = ptr + size; + + while (ptr < end) { + if (*ptr != '@') { + ptr = (const char*)memchr(ptr, '\n', end - ptr); + if (ptr) ptr++; else break; continue; + } + Read r; - r.name = line.substr(1); - std::getline(file, r.seq); - std::getline(file, line); - std::getline(file, line); - reads.push_back(r); + const char* name_end = (const char*)memchr(ptr, '\n', end - ptr); + if (!name_end) name_end = end; + r.name.assign(ptr + 1, name_end - (ptr + 1)); + if (!r.name.empty() && r.name.back() == '\r') r.name.pop_back(); + ptr = name_end + (name_end < end ? 1 : 0); + + if (ptr >= end) break; + + const char* seq_end = (const char*)memchr(ptr, '\n', end - ptr); + if (!seq_end) seq_end = end; + r.seq.assign(ptr, seq_end - ptr); + if (!r.seq.empty() && r.seq.back() == '\r') r.seq.pop_back(); + ptr = seq_end + (seq_end < end ? 1 : 0); + + ptr = (const char*)memchr(ptr, '\n', end - ptr); + if (ptr) ptr++; else break; + ptr = (const char*)memchr(ptr, '\n', end - ptr); + if (ptr) ptr++; else break; + + reads.push_back(std::move(r)); } + munmap((void*)mapped_ptr, size); } + inline uint8_t char2val_cpu(char c) { return (c >> 1) & 0x03; } inline uint32_t pack_16mer(const char *seq) { From f862f3e3bd48ad984c8a3c67fd9cc231cfa2c17b Mon Sep 17 00:00:00 2001 From: yingziyu-llt Date: Tue, 3 Mar 2026 17:37:58 +0800 Subject: [PATCH 5/8] better mp --- 05_seq_align/main.cu | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/05_seq_align/main.cu b/05_seq_align/main.cu index 960bce2..a8502c7 100644 --- a/05_seq_align/main.cu +++ b/05_seq_align/main.cu @@ -262,6 +262,7 @@ __global__ void sw_extend_batched_kernel(const uint32_t *ref_packed, size_t ref_ } int main() { + cudaFree(0); std::string ref_file = "reference.fasta"; std::string reads_file = "reads.fastq"; std::ofstream out_file("results.txt"); @@ -329,7 +330,7 @@ int main() { } } - std::sort(cands.begin(), cands.end()); + __gnu_parallel::sort(cands.begin(), cands.end()); int last_added = -1000; for (int pos : cands) { if (pos > last_added + 50) { From 480ee0e6e9404444e112545d0c4ad3a997e60a17 Mon Sep 17 00:00:00 2001 From: yingziyu-llt Date: Thu, 12 Mar 2026 16:56:14 +0800 Subject: [PATCH 6/8] update: wave-form dp --- 05_seq_align/main.cu | 600 +++++++++++++++++++++++++++++-------------- 1 file changed, 410 insertions(+), 190 deletions(-) diff --git a/05_seq_align/main.cu b/05_seq_align/main.cu index a8502c7..6e933f6 100644 --- a/05_seq_align/main.cu +++ b/05_seq_align/main.cu @@ -1,152 +1,235 @@ -#include +#include #include +#include +#include #include +#include #include #include #include +#include #include -#include -#include // For memchr -#include #include #include #include +#include +#ifdef USE_NVTX +#include +#endif + +struct ScopedTimer { + const char *name; + std::chrono::high_resolution_clock::time_point t0; + ScopedTimer(const char *n) : name(n), t0(std::chrono::high_resolution_clock::now()) { +#ifdef USE_NVTX + nvtxRangePushA(name); +#endif + } + ~ScopedTimer() { + auto t1 = std::chrono::high_resolution_clock::now(); + double ms = std::chrono::duration(t1 - t0).count(); + std::fprintf(stderr, "[TIMER][CPU] %s: %.3f ms\n", name, ms); +#ifdef USE_NVTX + nvtxRangePop(); +#endif + } +}; +struct ScopedCudaEventTimer { + const char *name; + cudaEvent_t ev0{}, ev1{}; + ScopedCudaEventTimer(const char *n) : name(n) { +#ifdef USE_NVTX + nvtxRangePushA(name); +#endif + cudaEventCreate(&ev0); + cudaEventCreate(&ev1); + cudaEventRecord(ev0); + } + ~ScopedCudaEventTimer() { + cudaEventRecord(ev1); + cudaEventSynchronize(ev1); + float ms = 0.0f; + cudaEventElapsedTime(&ms, ev0, ev1); + std::fprintf(stderr, "[TIMER][GPU] %s: %.3f ms\n", name, ms); + cudaEventDestroy(ev0); + cudaEventDestroy(ev1); +#ifdef USE_NVTX + nvtxRangePop(); +#endif + } +}; #define MATCH_SCORE 2 #define MISMATCH_SCORE -1 #define GAP_PENALTY -1 #define MAX_READ_LEN 256 #define KMER_SIZE 16 +constexpr int LOOKUP_PREFIX_BITS = 22; +constexpr uint32_t LOOKUP_BUCKETS = (1u << LOOKUP_PREFIX_BITS); +constexpr int LOOKUP_SHIFT = 32 - LOOKUP_PREFIX_BITS; struct Read { std::string name; std::string seq; }; +// Modified to hold read kmers instead of reference kmers struct KmerPos { uint32_t kmer; - int pos; - bool operator<(const KmerPos& other) const { - if (kmer != other.kmer) return kmer < other.kmer; - return pos < other.pos; - } + int read_id; + int offset; + bool operator<(const KmerPos &other) const { return kmer < other.kmer; } }; void load_reference(const std::string &filename, std::vector &names, std::string &concat_seq, std::vector &offsets) { int fd = open(filename.c_str(), O_RDONLY); - if (fd == -1) exit(1); - + if (fd == -1) + exit(1); + struct stat sb; - if (fstat(fd, &sb) == -1) exit(1); + if (fstat(fd, &sb) == -1) + exit(1); size_t size = sb.st_size; - - const char* mapped_ptr = static_cast(mmap(NULL, size, PROT_READ, MAP_PRIVATE, fd, 0)); - if (mapped_ptr == MAP_FAILED) exit(1); + + const char *mapped_ptr = static_cast(mmap(NULL, size, PROT_READ, MAP_PRIVATE, fd, 0)); + if (mapped_ptr == MAP_FAILED) + exit(1); close(fd); - concat_seq.reserve(size); - - const char* ptr = mapped_ptr; - const char* end = ptr + size; + concat_seq.reserve(size); + + const char *ptr = mapped_ptr; + const char *end = ptr + size; bool is_fastq = (ptr < end && *ptr == '@'); while (ptr < end) { if (is_fastq) { - const char* name_end = (const char*)memchr(ptr, '\n', end - ptr); - if (!name_end) name_end = end; + const char *name_end = (const char *)memchr(ptr, '\n', end - ptr); + if (!name_end) + name_end = end; std::string name(ptr + 1, name_end - (ptr + 1)); - if (!name.empty() && name.back() == '\r') name.pop_back(); + if (!name.empty() && name.back() == '\r') + name.pop_back(); ptr = name_end + (name_end < end ? 1 : 0); - if (ptr >= end) break; - const char* seq_end = (const char*)memchr(ptr, '\n', end - ptr); - if (!seq_end) seq_end = end; + if (ptr >= end) + break; + const char *seq_end = (const char *)memchr(ptr, '\n', end - ptr); + if (!seq_end) + seq_end = end; std::string seq(ptr, seq_end - ptr); - if (!seq.empty() && seq.back() == '\r') seq.pop_back(); + if (!seq.empty() && seq.back() == '\r') + seq.pop_back(); ptr = seq_end + (seq_end < end ? 1 : 0); names.push_back(name); offsets.push_back(concat_seq.size()); concat_seq += seq; - ptr = (const char*)memchr(ptr, '\n', end - ptr); - if (ptr) ptr++; else break; - ptr = (const char*)memchr(ptr, '\n', end - ptr); - if (ptr) ptr++; else break; + ptr = (const char *)memchr(ptr, '\n', end - ptr); + if (ptr) + ptr++; + else + break; + ptr = (const char *)memchr(ptr, '\n', end - ptr); + if (ptr) + ptr++; + else + break; } else { if (*ptr == '>') { - const char* name_end = (const char*)memchr(ptr, '\n', end - ptr); - if (!name_end) name_end = end; + const char *name_end = (const char *)memchr(ptr, '\n', end - ptr); + if (!name_end) + name_end = end; std::string name(ptr + 1, name_end - (ptr + 1)); - if (!name.empty() && name.back() == '\r') name.pop_back(); + if (!name.empty() && name.back() == '\r') + name.pop_back(); names.push_back(name); offsets.push_back(concat_seq.size()); ptr = name_end + (name_end < end ? 1 : 0); } else { - const char* seq_end = (const char*)memchr(ptr, '\n', end - ptr); - if (!seq_end) seq_end = end; + const char *seq_end = (const char *)memchr(ptr, '\n', end - ptr); + if (!seq_end) + seq_end = end; size_t seq_len = seq_end - ptr; - if (seq_len > 0 && *(seq_end - 1) == '\r') seq_len--; + if (seq_len > 0 && *(seq_end - 1) == '\r') + seq_len--; concat_seq.append(ptr, seq_len); ptr = seq_end + (seq_end < end ? 1 : 0); } } } - munmap((void*)mapped_ptr, size); + munmap((void *)mapped_ptr, size); } void load_reads(const std::string &filename, std::vector &reads) { int fd = open(filename.c_str(), O_RDONLY); - if (fd == -1) exit(1); - + if (fd == -1) + exit(1); + struct stat sb; - if (fstat(fd, &sb) == -1) exit(1); + if (fstat(fd, &sb) == -1) + exit(1); size_t size = sb.st_size; - - const char* mapped_ptr = static_cast(mmap(NULL, size, PROT_READ, MAP_PRIVATE, fd, 0)); - if (mapped_ptr == MAP_FAILED) exit(1); + + const char *mapped_ptr = static_cast(mmap(NULL, size, PROT_READ, MAP_PRIVATE, fd, 0)); + if (mapped_ptr == MAP_FAILED) + exit(1); close(fd); - reads.reserve(size / 150); + reads.reserve(size / 150); - const char* ptr = mapped_ptr; - const char* end = ptr + size; + const char *ptr = mapped_ptr; + const char *end = ptr + size; while (ptr < end) { if (*ptr != '@') { - ptr = (const char*)memchr(ptr, '\n', end - ptr); - if (ptr) ptr++; else break; + ptr = (const char *)memchr(ptr, '\n', end - ptr); + if (ptr) + ptr++; + else + break; continue; } Read r; - const char* name_end = (const char*)memchr(ptr, '\n', end - ptr); - if (!name_end) name_end = end; + const char *name_end = (const char *)memchr(ptr, '\n', end - ptr); + if (!name_end) + name_end = end; r.name.assign(ptr + 1, name_end - (ptr + 1)); - if (!r.name.empty() && r.name.back() == '\r') r.name.pop_back(); + if (!r.name.empty() && r.name.back() == '\r') + r.name.pop_back(); ptr = name_end + (name_end < end ? 1 : 0); - - if (ptr >= end) break; - const char* seq_end = (const char*)memchr(ptr, '\n', end - ptr); - if (!seq_end) seq_end = end; + if (ptr >= end) + break; + + const char *seq_end = (const char *)memchr(ptr, '\n', end - ptr); + if (!seq_end) + seq_end = end; r.seq.assign(ptr, seq_end - ptr); - if (!r.seq.empty() && r.seq.back() == '\r') r.seq.pop_back(); + if (!r.seq.empty() && r.seq.back() == '\r') + r.seq.pop_back(); ptr = seq_end + (seq_end < end ? 1 : 0); - ptr = (const char*)memchr(ptr, '\n', end - ptr); - if (ptr) ptr++; else break; - ptr = (const char*)memchr(ptr, '\n', end - ptr); - if (ptr) ptr++; else break; + ptr = (const char *)memchr(ptr, '\n', end - ptr); + if (ptr) + ptr++; + else + break; + ptr = (const char *)memchr(ptr, '\n', end - ptr); + if (ptr) + ptr++; + else + break; reads.push_back(std::move(r)); } - munmap((void*)mapped_ptr, size); + munmap((void *)mapped_ptr, size); } - inline uint8_t char2val_cpu(char c) { return (c >> 1) & 0x03; } inline uint32_t pack_16mer(const char *seq) { @@ -161,6 +244,23 @@ inline uint32_t pack_16mer(const char *seq) { void encode_sequence_cpu(const std::string &seq, uint32_t *buffer) { size_t length = seq.length(); size_t encoded_size = (length + 15) / 16; + + // 小任务串行,避免 OMP 开销 + if (encoded_size < 4096) { + for (size_t i = 0; i < encoded_size; ++i) { + uint32_t packed = 0; + size_t start = i * 16; + for (int j = 0; j < 16; ++j) { + if (start + j < length) { + uint8_t val = char2val_cpu(seq[start + j]); + packed |= (uint32_t)(val << (30 - 2 * j)); + } + } + buffer[i] = packed; + } + return; + } + #pragma omp parallel for for (size_t i = 0; i < encoded_size; ++i) { uint32_t packed = 0; @@ -181,16 +281,26 @@ __device__ inline uint8_t get_base(const uint32_t *packed_array, size_t index) { return (packed_array[array_idx] >> bit_pos) & 0x03; } -__global__ void sw_extend_batched_kernel(const uint32_t *ref_packed, size_t ref_len, const uint32_t *reads_packed, - const int *read_lengths, const int *candidate_positions, - const int *candidate_read_indices, int total_candidates, int *out_scores, - int *out_best_pos) { - int tid = blockIdx.x * blockDim.x + threadIdx.x; - if (tid >= total_candidates) +extern "C" __global__ void sw_extend_wavefront_kernel(const uint32_t *ref_packed, size_t ref_len, + const uint32_t *reads_packed, const int *read_lengths, + const int64_t *candidate_positions, + const int *candidate_read_indices, int total_candidates, + int *out_scores, int64_t *out_best_pos) { + int global_warp_id = (blockIdx.x * blockDim.x + threadIdx.x) / 32; + int local_warp_id = threadIdx.x / 32; + int lane_id = threadIdx.x % 32; + + if (global_warp_id >= total_candidates) return; - int ref_start = candidate_positions[tid]; - int read_idx = candidate_read_indices[tid]; + extern __shared__ short shared_buf[]; + + int warps_per_block = blockDim.x / 32; + short *H_row = &shared_buf[local_warp_id * MAX_READ_LEN]; + short *Start_row = &shared_buf[local_warp_id * MAX_READ_LEN + (MAX_READ_LEN * warps_per_block)]; + + int64_t ref_start = candidate_positions[global_warp_id]; + int read_idx = candidate_read_indices[global_warp_id]; int read_len = read_lengths[read_idx]; const uint32_t *my_read_packed = reads_packed + read_idx * (MAX_READ_LEN / 16); @@ -200,69 +310,117 @@ __global__ void sw_extend_batched_kernel(const uint32_t *ref_packed, size_t ref_ ref_region_len = ref_len - ref_start; } - int H_row[MAX_READ_LEN]; - int Start_row[MAX_READ_LEN]; - for (int i = 0; i < read_len; ++i) { - H_row[i] = 0; - Start_row[i] = -1; + int total_chunks = (read_len + 31) / 32; + + for (int chunk = 0; chunk < total_chunks; ++chunk) { + int j = chunk * 32 + lane_id; + if (j < read_len) { + H_row[j] = 0; + Start_row[j] = -1; + } } - int max_score = 0; - int best_ref_start = ref_start; - - for (int i = 0; i < ref_region_len; ++i) { - int current_ref_pos = ref_start + i; - uint8_t ref_base = get_base(ref_packed, current_ref_pos); - - int h_diag = 0; - int start_diag = current_ref_pos; - int h_left = 0; - int start_left = -1; - - for (int j = 0; j < read_len; ++j) { - uint8_t read_base = get_base(my_read_packed, j); - int match = (ref_base == read_base) ? MATCH_SCORE : MISMATCH_SCORE; - - int score_diag = h_diag + match; - int score_up = H_row[j] + GAP_PENALTY; - int score_left = h_left + GAP_PENALTY; - - int score = 0; - int start = current_ref_pos; - - if (score_diag > 0 && score_diag >= score_up && score_diag >= score_left) { - score = score_diag; - start = (h_diag == 0) ? current_ref_pos : start_diag; - } else if (score_up > 0 && score_up >= score_left) { - score = score_up; - start = (H_row[j] == 0) ? current_ref_pos : Start_row[j]; - } else if (score_left > 0) { - score = score_left; - start = (h_left == 0) ? current_ref_pos : start_left; - } + __syncwarp(); + + short max_score = 0; + short best_ref_offset = 0; + + short h_diag_reg[8] = {0}; + short start_diag_reg[8] = {0}; + + short next_h_reg[8] = {0}; + short next_start_reg[8] = {-1}; + + int total_steps = ref_region_len + read_len - 1; + + for (int step = 0; step < total_steps; ++step) { + + for (int chunk = 0; chunk < total_chunks; ++chunk) { + int j = chunk * 32 + lane_id; + int i = step - j; - h_diag = H_row[j]; - start_diag = Start_row[j]; + if (i >= 0 && i < ref_region_len && j < read_len) { + int64_t current_ref_pos = ref_start + i; + uint8_t ref_base = get_base(ref_packed, current_ref_pos); + uint8_t read_base = get_base(my_read_packed, j); - H_row[j] = score; - Start_row[j] = start; + short match = (ref_base == read_base) ? MATCH_SCORE : MISMATCH_SCORE; + + short h_up = H_row[j]; + short start_up = Start_row[j]; + + short h_left = (j > 0) ? H_row[j - 1] : 0; + short start_left = (j > 0) ? Start_row[j - 1] : -1; + + short score_diag = h_diag_reg[chunk] + match; + short score_up = h_up + GAP_PENALTY; + short score_left = h_left + GAP_PENALTY; + + short score = 0; + short start = i; + + if (score_diag > 0 && score_diag >= score_up && score_diag >= score_left) { + score = score_diag; + start = (h_diag_reg[chunk] == 0) ? i : start_diag_reg[chunk]; + } else if (score_up > 0 && score_up >= score_left) { + score = score_up; + start = (h_up == 0) ? i : start_up; + } else if (score_left > 0) { + score = score_left; + start = (h_left == 0) ? i : start_left; + } + + // 【修复核心】将当前步的左侧值,作为下一步的对角线值缓存! + h_diag_reg[chunk] = h_left; + start_diag_reg[chunk] = start_left; + + next_h_reg[chunk] = score; + next_start_reg[chunk] = start; + + if (score > max_score) { + max_score = score; + best_ref_offset = start; + } + } + } - h_left = score; - start_left = start; + __syncwarp(); - if (score > max_score) { - max_score = score; - best_ref_start = start; + for (int chunk = 0; chunk < total_chunks; ++chunk) { + int j = chunk * 32 + lane_id; + int i = step - j; + if (i >= 0 && i < ref_region_len && j < read_len) { + H_row[j] = next_h_reg[chunk]; + Start_row[j] = next_start_reg[chunk]; } } + + __syncwarp(); + } + + for (int offset = 16; offset > 0; offset /= 2) { + short shfl_score = __shfl_down_sync(0xffffffff, max_score, offset); + short shfl_offset = __shfl_down_sync(0xffffffff, best_ref_offset, offset); + if (shfl_score > max_score) { + max_score = shfl_score; + best_ref_offset = shfl_offset; + } } - out_scores[tid] = max_score; - out_best_pos[tid] = best_ref_start; + if (lane_id == 0) { + out_scores[global_warp_id] = max_score; + out_best_pos[global_warp_id] = ref_start + best_ref_offset; + } } int main() { - cudaFree(0); + ScopedTimer t_all("main_total"); + + { + ScopedTimer t("cuda_warmup"); + cudaFree(0); + } + std::string ref_file = "reference.fasta"; std::string reads_file = "reads.fastq"; std::ofstream out_file("results.txt"); @@ -271,81 +429,136 @@ int main() { std::vector ref_offsets; std::string concat_ref; - load_reference(ref_file, ref_names, concat_ref, ref_offsets); + { + ScopedTimer t("load_reference"); + load_reference(ref_file, ref_names, concat_ref, ref_offsets); + } - std::vector reads; - load_reads(reads_file, reads); + { + ScopedTimer t("append_ref_padding"); + concat_ref.append(100, 'N'); + } - // Build kmer index with array and sort - std::vector kmer_list; - if (concat_ref.length() >= KMER_SIZE) { - kmer_list.reserve(concat_ref.length() - KMER_SIZE + 1); - for (size_t i = 0; i + KMER_SIZE <= concat_ref.length(); i += 1) { - kmer_list.push_back({pack_16mer(concat_ref.c_str() + i), (int)i}); - } - __gnu_parallel::sort(kmer_list.begin(), kmer_list.end()); + std::vector reads; + { + ScopedTimer t("load_reads"); + load_reads(reads_file, reads); } + int num_reads = reads.size(); + size_t ref_encoded_size = (concat_ref.length() + 15) / 16; uint32_t *h_ref_packed = new uint32_t[ref_encoded_size]; - encode_sequence_cpu(concat_ref, h_ref_packed); + + { + ScopedTimer t("encode_reference_cpu"); + encode_sequence_cpu(concat_ref, h_ref_packed); + } uint32_t *d_ref_packed; - cudaMalloc(&d_ref_packed, ref_encoded_size * sizeof(uint32_t)); - cudaMemcpy(d_ref_packed, h_ref_packed, ref_encoded_size * sizeof(uint32_t), cudaMemcpyHostToDevice); + { + ScopedCudaEventTimer t("cuda_malloc_ref"); + cudaMalloc(&d_ref_packed, ref_encoded_size * sizeof(uint32_t)); + } + + { + ScopedCudaEventTimer t("h2d_ref_copy"); + cudaMemcpy(d_ref_packed, h_ref_packed, ref_encoded_size * sizeof(uint32_t), cudaMemcpyHostToDevice); + } - int num_reads = reads.size(); std::vector h_reads_packed(num_reads * (MAX_READ_LEN / 16), 0); std::vector h_read_lengths(num_reads, 0); + std::vector kmer_list; + kmer_list.reserve(num_reads * 50); - std::vector h_candidate_pos; - std::vector h_candidate_read_idx; + for (int i = 0; i < num_reads; ++i) { + int rlen = std::min((int)reads[i].seq.length(), MAX_READ_LEN); + h_read_lengths[i] = rlen; + encode_sequence_cpu(reads[i].seq, h_reads_packed.data() + i * (MAX_READ_LEN / 16)); - // Parallelize seed matching - #pragma omp parallel - { - std::vector local_cands_pos; - std::vector local_cands_idx; - - #pragma omp for schedule(dynamic, 100) - for (int i = 0; i < num_reads; ++i) { - int rlen = std::min((int)reads[i].seq.length(), MAX_READ_LEN); - h_read_lengths[i] = rlen; - - uint32_t *read_ptr = h_reads_packed.data() + i * (MAX_READ_LEN / 16); - encode_sequence_cpu(reads[i].seq, read_ptr); - - std::vector cands; - for (int offset = 0; offset <= rlen - KMER_SIZE; offset += 2) { - uint32_t seed = pack_16mer(reads[i].seq.c_str() + offset); - KmerPos target = {seed, 0}; - auto it = std::lower_bound(kmer_list.begin(), kmer_list.end(), target); - - int count = 0; - while (it != kmer_list.end() && it->kmer == seed && count < 100) { - int ref_window_start = std::max(0, it->pos - offset - 50); - cands.push_back(ref_window_start); - ++it; - ++count; + for (int offset = 0; offset <= rlen - KMER_SIZE; offset += 2) { + kmer_list.push_back({pack_16mer(reads[i].seq.c_str() + offset), i, offset}); + } + } + __gnu_parallel::sort(kmer_list.begin(), kmer_list.end()); + + // 22-bit prefix lookup + std::vector lookup(LOOKUP_BUCKETS + 1, (uint32_t)kmer_list.size()); + uint32_t prefix = 0; + lookup[0] = 0; + for (size_t i = 0; i < kmer_list.size(); ++i) { + uint32_t p = kmer_list[i].kmer >> LOOKUP_SHIFT; + while (prefix < p) + lookup[++prefix] = (uint32_t)i; + } + while (prefix < LOOKUP_BUCKETS) + lookup[++prefix] = (uint32_t)kmer_list.size(); + + std::vector> read_cands(num_reads); + + // rolling k-mer scan on reference + if (concat_ref.length() >= (KMER_SIZE + 100)) { + const int64_t max_pos = (int64_t)concat_ref.length() - KMER_SIZE - 100; // inclusive + +#pragma omp parallel + { + std::vector> local_cands(num_reads); + + const int tid = omp_get_thread_num(); + const int nth = omp_get_num_threads(); + + const int64_t total = max_pos + 1; // pos in [0, max_pos] + const int64_t chunk = (total + nth - 1) / nth; + const int64_t begin = tid * chunk; + const int64_t end = std::min(total, begin + chunk); + + if (begin < end) { + uint32_t kmer = pack_16mer(concat_ref.data() + begin); + + for (int64_t pos = begin; pos < end; ++pos) { + uint32_t p = kmer >> LOOKUP_SHIFT; + + for (uint32_t i = lookup[p]; i < lookup[p + 1]; ++i) { + if (kmer_list[i].kmer == kmer) { + int64_t start = pos - kmer_list[i].offset - 50; + if (start >= 0) + local_cands[kmer_list[i].read_id].push_back(start); + } + } + + // roll to next 16-mer + if (pos + 1 < end) { + kmer = (kmer << 2) | (uint32_t)char2val_cpu(concat_ref[pos + KMER_SIZE]); + } } } - __gnu_parallel::sort(cands.begin(), cands.end()); - int last_added = -1000; - for (int pos : cands) { - if (pos > last_added + 50) { - local_cands_pos.push_back(pos); - local_cands_idx.push_back(i); - last_added = pos; +#pragma omp critical + { + for (int i = 0; i < num_reads; ++i) { + read_cands[i].insert(read_cands[i].end(), local_cands[i].begin(), local_cands[i].end()); } } } + } - // Merge thread-local results - #pragma omp critical - { - h_candidate_pos.insert(h_candidate_pos.end(), local_cands_pos.begin(), local_cands_pos.end()); - h_candidate_read_idx.insert(h_candidate_read_idx.end(), local_cands_idx.begin(), local_cands_idx.end()); + std::vector h_candidate_pos; + std::vector h_candidate_read_idx; + + for (int i = 0; i < num_reads; ++i) { + if (read_cands[i].empty()) + continue; + std::sort(read_cands[i].begin(), read_cands[i].end()); + int64_t last_added = -1000; + int count = 0; + for (int64_t pos : read_cands[i]) { + if (pos > last_added + 50) { + h_candidate_pos.push_back(pos); + h_candidate_read_idx.push_back(i); + last_added = pos; + if (++count >= 100) + break; + } } } @@ -357,36 +570,43 @@ int main() { } uint32_t *d_reads_packed; - int *d_read_lengths, *d_candidate_pos, *d_candidate_read_idx, *d_out_scores, *d_out_best_pos; + int *d_read_lengths, *d_candidate_read_idx, *d_out_scores; + int64_t *d_candidate_pos, *d_out_best_pos; cudaMalloc(&d_reads_packed, h_reads_packed.size() * sizeof(uint32_t)); cudaMalloc(&d_read_lengths, num_reads * sizeof(int)); - cudaMalloc(&d_candidate_pos, total_candidates * sizeof(int)); + cudaMalloc(&d_candidate_pos, total_candidates * sizeof(int64_t)); cudaMalloc(&d_candidate_read_idx, total_candidates * sizeof(int)); cudaMalloc(&d_out_scores, total_candidates * sizeof(int)); - cudaMalloc(&d_out_best_pos, total_candidates * sizeof(int)); + cudaMalloc(&d_out_best_pos, total_candidates * sizeof(int64_t)); cudaMemcpy(d_reads_packed, h_reads_packed.data(), h_reads_packed.size() * sizeof(uint32_t), cudaMemcpyHostToDevice); cudaMemcpy(d_read_lengths, h_read_lengths.data(), num_reads * sizeof(int), cudaMemcpyHostToDevice); - cudaMemcpy(d_candidate_pos, h_candidate_pos.data(), total_candidates * sizeof(int), cudaMemcpyHostToDevice); + cudaMemcpy(d_candidate_pos, h_candidate_pos.data(), total_candidates * sizeof(int64_t), cudaMemcpyHostToDevice); cudaMemcpy(d_candidate_read_idx, h_candidate_read_idx.data(), total_candidates * sizeof(int), cudaMemcpyHostToDevice); - int blockSize = 256; - int numBlocks = (total_candidates + blockSize - 1) / blockSize; + int blockSize = 128; // 4 Warps per Block + // Total threads needed = total_candidates * 32 + int numBlocks = (total_candidates * 32 + blockSize - 1) / blockSize; + + int warps_per_block = blockSize / 32; + // Shared memory: warps_per_block * (MAX_READ_LEN * 2 arrays) * sizeof(short) + size_t sharedMemSize = warps_per_block * MAX_READ_LEN * 2 * sizeof(short); + + sw_extend_wavefront_kernel<<>>( + d_ref_packed, concat_ref.length() - 100, d_reads_packed, d_read_lengths, d_candidate_pos, d_candidate_read_idx, + total_candidates, d_out_scores, d_out_best_pos); - sw_extend_batched_kernel<<>>(d_ref_packed, concat_ref.length(), d_reads_packed, - d_read_lengths, d_candidate_pos, d_candidate_read_idx, - total_candidates, d_out_scores, d_out_best_pos); cudaDeviceSynchronize(); std::vector h_out_scores(total_candidates); - std::vector h_out_best_pos(total_candidates); + std::vector h_out_best_pos(total_candidates); cudaMemcpy(h_out_scores.data(), d_out_scores, total_candidates * sizeof(int), cudaMemcpyDeviceToHost); - cudaMemcpy(h_out_best_pos.data(), d_out_best_pos, total_candidates * sizeof(int), cudaMemcpyDeviceToHost); + cudaMemcpy(h_out_best_pos.data(), d_out_best_pos, total_candidates * sizeof(int64_t), cudaMemcpyDeviceToHost); std::vector best_score_per_read(num_reads, -1); - std::vector best_pos_per_read(num_reads, -1); + std::vector best_pos_per_read(num_reads, -1); for (int i = 0; i < total_candidates; ++i) { int r_idx = h_candidate_read_idx[i]; @@ -398,10 +618,10 @@ int main() { for (int i = 0; i < num_reads; ++i) { if (best_score_per_read[i] > h_read_lengths[i] * 0.5) { - int global_pos = best_pos_per_read[i]; + int64_t global_pos = best_pos_per_read[i]; auto it = std::upper_bound(ref_offsets.begin(), ref_offsets.end(), global_pos); int ref_idx = std::distance(ref_offsets.begin(), it) - 1; - int local_pos = global_pos - ref_offsets[ref_idx]; + int64_t local_pos = global_pos - ref_offsets[ref_idx]; out_file << reads[i].name << " " << ref_names[ref_idx] << " " << local_pos << "\n"; } else { @@ -419,4 +639,4 @@ int main() { delete[] h_ref_packed; return 0; -} \ No newline at end of file +} From 5439885bbed77a43781cb0234ed9c0e2d81163ca Mon Sep 17 00:00:00 2001 From: yingziyu-llt Date: Sat, 14 Mar 2026 17:14:06 +0800 Subject: [PATCH 7/8] update all --- .../generate_benchmark.py" | 8 +-- .../main.cu" | 71 +------------------ 2 files changed, 6 insertions(+), 73 deletions(-) rename 05_seq_align/generate_benchmark.py => "05_seq_align/\346\236\227\344\271\220\345\244\251/generate_benchmark.py" (96%) rename 05_seq_align/main.cu => "05_seq_align/\346\236\227\344\271\220\345\244\251/main.cu" (92%) diff --git a/05_seq_align/generate_benchmark.py "b/05_seq_align/\346\236\227\344\271\220\345\244\251/generate_benchmark.py" similarity index 96% rename from 05_seq_align/generate_benchmark.py rename to "05_seq_align/\346\236\227\344\271\220\345\244\251/generate_benchmark.py" index 47333e8..43a3687 100644 --- a/05_seq_align/generate_benchmark.py +++ "b/05_seq_align/\346\236\227\344\271\220\345\244\251/generate_benchmark.py" @@ -1,9 +1,9 @@ import random import os -NUM_CHROMOSOMES = 2 -CHROM_LENGTH = 50000 -NUM_READS = 1000 +NUM_CHROMOSOMES = 100 +CHROM_LENGTH = 100000000 +NUM_READS = 10000 READ_LENGTH = 100 MUTATION_RATE = 0.05 UNKNOWN_RATE = 0.1 @@ -87,4 +87,4 @@ def generate_fastq_and_truth(chromosomes, fastq_file, truth_file): print("Generating reads and ground truth...") generate_fastq_and_truth(chroms, "reads.fastq", "truth.txt") - print("Benchmark data generated successfully.") \ No newline at end of file + print("Benchmark data generated successfully.") diff --git a/05_seq_align/main.cu "b/05_seq_align/\346\236\227\344\271\220\345\244\251/main.cu" similarity index 92% rename from 05_seq_align/main.cu rename to "05_seq_align/\346\236\227\344\271\220\345\244\251/main.cu" index 6e933f6..0327b9f 100644 --- a/05_seq_align/main.cu +++ "b/05_seq_align/\346\236\227\344\271\220\345\244\251/main.cu" @@ -13,52 +13,6 @@ #include #include #include -#ifdef USE_NVTX -#include -#endif - -struct ScopedTimer { - const char *name; - std::chrono::high_resolution_clock::time_point t0; - ScopedTimer(const char *n) : name(n), t0(std::chrono::high_resolution_clock::now()) { -#ifdef USE_NVTX - nvtxRangePushA(name); -#endif - } - ~ScopedTimer() { - auto t1 = std::chrono::high_resolution_clock::now(); - double ms = std::chrono::duration(t1 - t0).count(); - std::fprintf(stderr, "[TIMER][CPU] %s: %.3f ms\n", name, ms); -#ifdef USE_NVTX - nvtxRangePop(); -#endif - } -}; - -struct ScopedCudaEventTimer { - const char *name; - cudaEvent_t ev0{}, ev1{}; - ScopedCudaEventTimer(const char *n) : name(n) { -#ifdef USE_NVTX - nvtxRangePushA(name); -#endif - cudaEventCreate(&ev0); - cudaEventCreate(&ev1); - cudaEventRecord(ev0); - } - ~ScopedCudaEventTimer() { - cudaEventRecord(ev1); - cudaEventSynchronize(ev1); - float ms = 0.0f; - cudaEventElapsedTime(&ms, ev0, ev1); - std::fprintf(stderr, "[TIMER][GPU] %s: %.3f ms\n", name, ms); - cudaEventDestroy(ev0); - cudaEventDestroy(ev1); -#ifdef USE_NVTX - nvtxRangePop(); -#endif - } -}; #define MATCH_SCORE 2 #define MISMATCH_SCORE -1 @@ -414,12 +368,7 @@ extern "C" __global__ void sw_extend_wavefront_kernel(const uint32_t *ref_packed } int main() { - ScopedTimer t_all("main_total"); - - { - ScopedTimer t("cuda_warmup"); cudaFree(0); - } std::string ref_file = "reference.fasta"; std::string reads_file = "reads.fastq"; @@ -429,42 +378,26 @@ int main() { std::vector ref_offsets; std::string concat_ref; - { - ScopedTimer t("load_reference"); load_reference(ref_file, ref_names, concat_ref, ref_offsets); - } - { - ScopedTimer t("append_ref_padding"); concat_ref.append(100, 'N'); - } std::vector reads; - { - ScopedTimer t("load_reads"); load_reads(reads_file, reads); - } int num_reads = reads.size(); size_t ref_encoded_size = (concat_ref.length() + 15) / 16; uint32_t *h_ref_packed = new uint32_t[ref_encoded_size]; - { - ScopedTimer t("encode_reference_cpu"); encode_sequence_cpu(concat_ref, h_ref_packed); - } uint32_t *d_ref_packed; - { - ScopedCudaEventTimer t("cuda_malloc_ref"); cudaMalloc(&d_ref_packed, ref_encoded_size * sizeof(uint32_t)); - } + - { - ScopedCudaEventTimer t("h2d_ref_copy"); cudaMemcpy(d_ref_packed, h_ref_packed, ref_encoded_size * sizeof(uint32_t), cudaMemcpyHostToDevice); - } + std::vector h_reads_packed(num_reads * (MAX_READ_LEN / 16), 0); std::vector h_read_lengths(num_reads, 0); From 5a77c7f9021773add7888be473d5ca44dd82a92f Mon Sep 17 00:00:00 2001 From: yingziyu-llt Date: Sat, 14 Mar 2026 17:22:32 +0800 Subject: [PATCH 8/8] finalize --- .../main.cu" | 16 +- .../main.maca" | 572 +++++++++++++++++ .../main.mu" | 575 ++++++++++++++++++ 3 files changed, 1154 insertions(+), 9 deletions(-) create mode 100644 "05_seq_align/\346\236\227\344\271\220\345\244\251/main.maca" create mode 100644 "05_seq_align/\346\236\227\344\271\220\345\244\251/main.mu" diff --git "a/05_seq_align/\346\236\227\344\271\220\345\244\251/main.cu" "b/05_seq_align/\346\236\227\344\271\220\345\244\251/main.cu" index 0327b9f..2da1335 100644 --- "a/05_seq_align/\346\236\227\344\271\220\345\244\251/main.cu" +++ "b/05_seq_align/\346\236\227\344\271\220\345\244\251/main.cu" @@ -368,7 +368,7 @@ extern "C" __global__ void sw_extend_wavefront_kernel(const uint32_t *ref_packed } int main() { - cudaFree(0); + cudaFree(0); std::string ref_file = "reference.fasta"; std::string reads_file = "reads.fastq"; @@ -378,26 +378,24 @@ int main() { std::vector ref_offsets; std::string concat_ref; - load_reference(ref_file, ref_names, concat_ref, ref_offsets); + load_reference(ref_file, ref_names, concat_ref, ref_offsets); - concat_ref.append(100, 'N'); + concat_ref.append(100, 'N'); std::vector reads; - load_reads(reads_file, reads); + load_reads(reads_file, reads); int num_reads = reads.size(); size_t ref_encoded_size = (concat_ref.length() + 15) / 16; uint32_t *h_ref_packed = new uint32_t[ref_encoded_size]; - encode_sequence_cpu(concat_ref, h_ref_packed); + encode_sequence_cpu(concat_ref, h_ref_packed); uint32_t *d_ref_packed; - cudaMalloc(&d_ref_packed, ref_encoded_size * sizeof(uint32_t)); - + cudaMalloc(&d_ref_packed, ref_encoded_size * sizeof(uint32_t)); - cudaMemcpy(d_ref_packed, h_ref_packed, ref_encoded_size * sizeof(uint32_t), cudaMemcpyHostToDevice); - + cudaMemcpy(d_ref_packed, h_ref_packed, ref_encoded_size * sizeof(uint32_t), cudaMemcpyHostToDevice); std::vector h_reads_packed(num_reads * (MAX_READ_LEN / 16), 0); std::vector h_read_lengths(num_reads, 0); diff --git "a/05_seq_align/\346\236\227\344\271\220\345\244\251/main.maca" "b/05_seq_align/\346\236\227\344\271\220\345\244\251/main.maca" new file mode 100644 index 0000000..207b926 --- /dev/null +++ "b/05_seq_align/\346\236\227\344\271\220\345\244\251/main.maca" @@ -0,0 +1,572 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define MATCH_SCORE 2 +#define MISMATCH_SCORE -1 +#define GAP_PENALTY -1 +#define MAX_READ_LEN 256 +#define KMER_SIZE 16 +constexpr int LOOKUP_PREFIX_BITS = 22; +constexpr uint32_t LOOKUP_BUCKETS = (1u << LOOKUP_PREFIX_BITS); +constexpr int LOOKUP_SHIFT = 32 - LOOKUP_PREFIX_BITS; + +struct Read { + std::string name; + std::string seq; +}; + +// Modified to hold read kmers instead of reference kmers +struct KmerPos { + uint32_t kmer; + int read_id; + int offset; + bool operator<(const KmerPos &other) const { return kmer < other.kmer; } +}; + +void load_reference(const std::string &filename, std::vector &names, std::string &concat_seq, + std::vector &offsets) { + int fd = open(filename.c_str(), O_RDONLY); + if (fd == -1) + exit(1); + + struct stat sb; + if (fstat(fd, &sb) == -1) + exit(1); + size_t size = sb.st_size; + + const char *mapped_ptr = static_cast(mmap(NULL, size, PROT_READ, MAP_PRIVATE, fd, 0)); + if (mapped_ptr == MAP_FAILED) + exit(1); + close(fd); + + concat_seq.reserve(size); + + const char *ptr = mapped_ptr; + const char *end = ptr + size; + bool is_fastq = (ptr < end && *ptr == '@'); + + while (ptr < end) { + if (is_fastq) { + const char *name_end = (const char *)memchr(ptr, '\n', end - ptr); + if (!name_end) + name_end = end; + std::string name(ptr + 1, name_end - (ptr + 1)); + if (!name.empty() && name.back() == '\r') + name.pop_back(); + ptr = name_end + (name_end < end ? 1 : 0); + + if (ptr >= end) + break; + const char *seq_end = (const char *)memchr(ptr, '\n', end - ptr); + if (!seq_end) + seq_end = end; + std::string seq(ptr, seq_end - ptr); + if (!seq.empty() && seq.back() == '\r') + seq.pop_back(); + ptr = seq_end + (seq_end < end ? 1 : 0); + + names.push_back(name); + offsets.push_back(concat_seq.size()); + concat_seq += seq; + + ptr = (const char *)memchr(ptr, '\n', end - ptr); + if (ptr) + ptr++; + else + break; + ptr = (const char *)memchr(ptr, '\n', end - ptr); + if (ptr) + ptr++; + else + break; + } else { + if (*ptr == '>') { + const char *name_end = (const char *)memchr(ptr, '\n', end - ptr); + if (!name_end) + name_end = end; + std::string name(ptr + 1, name_end - (ptr + 1)); + if (!name.empty() && name.back() == '\r') + name.pop_back(); + names.push_back(name); + offsets.push_back(concat_seq.size()); + ptr = name_end + (name_end < end ? 1 : 0); + } else { + const char *seq_end = (const char *)memchr(ptr, '\n', end - ptr); + if (!seq_end) + seq_end = end; + size_t seq_len = seq_end - ptr; + if (seq_len > 0 && *(seq_end - 1) == '\r') + seq_len--; + concat_seq.append(ptr, seq_len); + ptr = seq_end + (seq_end < end ? 1 : 0); + } + } + } + munmap((void *)mapped_ptr, size); +} + +void load_reads(const std::string &filename, std::vector &reads) { + int fd = open(filename.c_str(), O_RDONLY); + if (fd == -1) + exit(1); + + struct stat sb; + if (fstat(fd, &sb) == -1) + exit(1); + size_t size = sb.st_size; + + const char *mapped_ptr = static_cast(mmap(NULL, size, PROT_READ, MAP_PRIVATE, fd, 0)); + if (mapped_ptr == MAP_FAILED) + exit(1); + close(fd); + + reads.reserve(size / 150); + + const char *ptr = mapped_ptr; + const char *end = ptr + size; + + while (ptr < end) { + if (*ptr != '@') { + ptr = (const char *)memchr(ptr, '\n', end - ptr); + if (ptr) + ptr++; + else + break; + continue; + } + + Read r; + const char *name_end = (const char *)memchr(ptr, '\n', end - ptr); + if (!name_end) + name_end = end; + r.name.assign(ptr + 1, name_end - (ptr + 1)); + if (!r.name.empty() && r.name.back() == '\r') + r.name.pop_back(); + ptr = name_end + (name_end < end ? 1 : 0); + + if (ptr >= end) + break; + + const char *seq_end = (const char *)memchr(ptr, '\n', end - ptr); + if (!seq_end) + seq_end = end; + r.seq.assign(ptr, seq_end - ptr); + if (!r.seq.empty() && r.seq.back() == '\r') + r.seq.pop_back(); + ptr = seq_end + (seq_end < end ? 1 : 0); + + ptr = (const char *)memchr(ptr, '\n', end - ptr); + if (ptr) + ptr++; + else + break; + ptr = (const char *)memchr(ptr, '\n', end - ptr); + if (ptr) + ptr++; + else + break; + + reads.push_back(std::move(r)); + } + munmap((void *)mapped_ptr, size); +} + +inline uint8_t char2val_cpu(char c) { return (c >> 1) & 0x03; } + +inline uint32_t pack_16mer(const char *seq) { + uint32_t packed = 0; + for (int i = 0; i < 16; ++i) { + uint8_t val = char2val_cpu(seq[i]); + packed |= (val << (30 - 2 * i)); + } + return packed; +} + +void encode_sequence_cpu(const std::string &seq, uint32_t *buffer) { + size_t length = seq.length(); + size_t encoded_size = (length + 15) / 16; + + // 小任务串行,避免 OMP 开销 + if (encoded_size < 4096) { + for (size_t i = 0; i < encoded_size; ++i) { + uint32_t packed = 0; + size_t start = i * 16; + for (int j = 0; j < 16; ++j) { + if (start + j < length) { + uint8_t val = char2val_cpu(seq[start + j]); + packed |= (uint32_t)(val << (30 - 2 * j)); + } + } + buffer[i] = packed; + } + return; + } + +#pragma omp parallel for + for (size_t i = 0; i < encoded_size; ++i) { + uint32_t packed = 0; + size_t start = i * 16; + for (int j = 0; j < 16; ++j) { + if (start + j < length) { + uint8_t val = char2val_cpu(seq[start + j]); + packed |= (uint32_t)(val << (30 - 2 * j)); + } + } + buffer[i] = packed; + } +} + +__device__ inline uint8_t get_base(const uint32_t *packed_array, size_t index) { + size_t array_idx = index / 16; + int bit_pos = 30 - 2 * (index % 16); + return (packed_array[array_idx] >> bit_pos) & 0x03; +} + +extern "C" __global__ void sw_extend_wavefront_kernel(const uint32_t *ref_packed, size_t ref_len, + const uint32_t *reads_packed, const int *read_lengths, + const int64_t *candidate_positions, + const int *candidate_read_indices, int total_candidates, + int *out_scores, int64_t *out_best_pos) { + int global_warp_id = (blockIdx.x * blockDim.x + threadIdx.x) / 32; + int local_warp_id = threadIdx.x / 32; + int lane_id = threadIdx.x % 32; + + if (global_warp_id >= total_candidates) + return; + + extern __shared__ short shared_buf[]; + + int warps_per_block = blockDim.x / 32; + short *H_row = &shared_buf[local_warp_id * MAX_READ_LEN]; + short *Start_row = &shared_buf[local_warp_id * MAX_READ_LEN + (MAX_READ_LEN * warps_per_block)]; + + int64_t ref_start = candidate_positions[global_warp_id]; + int read_idx = candidate_read_indices[global_warp_id]; + int read_len = read_lengths[read_idx]; + + const uint32_t *my_read_packed = reads_packed + read_idx * (MAX_READ_LEN / 16); + + int ref_region_len = read_len + 100; + if (ref_start + ref_region_len > ref_len) { + ref_region_len = ref_len - ref_start; + } + + int total_chunks = (read_len + 31) / 32; + + for (int chunk = 0; chunk < total_chunks; ++chunk) { + int j = chunk * 32 + lane_id; + if (j < read_len) { + H_row[j] = 0; + Start_row[j] = -1; + } + } + + __syncwarp(); + + short max_score = 0; + short best_ref_offset = 0; + + short h_diag_reg[8] = {0}; + short start_diag_reg[8] = {0}; + + short next_h_reg[8] = {0}; + short next_start_reg[8] = {-1}; + + int total_steps = ref_region_len + read_len - 1; + + for (int step = 0; step < total_steps; ++step) { + + for (int chunk = 0; chunk < total_chunks; ++chunk) { + int j = chunk * 32 + lane_id; + int i = step - j; + + if (i >= 0 && i < ref_region_len && j < read_len) { + int64_t current_ref_pos = ref_start + i; + uint8_t ref_base = get_base(ref_packed, current_ref_pos); + uint8_t read_base = get_base(my_read_packed, j); + + short match = (ref_base == read_base) ? MATCH_SCORE : MISMATCH_SCORE; + + short h_up = H_row[j]; + short start_up = Start_row[j]; + + short h_left = (j > 0) ? H_row[j - 1] : 0; + short start_left = (j > 0) ? Start_row[j - 1] : -1; + + short score_diag = h_diag_reg[chunk] + match; + short score_up = h_up + GAP_PENALTY; + short score_left = h_left + GAP_PENALTY; + + short score = 0; + short start = i; + + if (score_diag > 0 && score_diag >= score_up && score_diag >= score_left) { + score = score_diag; + start = (h_diag_reg[chunk] == 0) ? i : start_diag_reg[chunk]; + } else if (score_up > 0 && score_up >= score_left) { + score = score_up; + start = (h_up == 0) ? i : start_up; + } else if (score_left > 0) { + score = score_left; + start = (h_left == 0) ? i : start_left; + } + + // 【修复核心】将当前步的左侧值,作为下一步的对角线值缓存! + h_diag_reg[chunk] = h_left; + start_diag_reg[chunk] = start_left; + + next_h_reg[chunk] = score; + next_start_reg[chunk] = start; + + if (score > max_score) { + max_score = score; + best_ref_offset = start; + } + } + } + + __syncwarp(); + + for (int chunk = 0; chunk < total_chunks; ++chunk) { + int j = chunk * 32 + lane_id; + int i = step - j; + if (i >= 0 && i < ref_region_len && j < read_len) { + H_row[j] = next_h_reg[chunk]; + Start_row[j] = next_start_reg[chunk]; + } + } + + __syncwarp(); + } + + for (int offset = 16; offset > 0; offset /= 2) { + short shfl_score = __shfl_down_sync(0xffffffff, max_score, offset); + short shfl_offset = __shfl_down_sync(0xffffffff, best_ref_offset, offset); + if (shfl_score > max_score) { + max_score = shfl_score; + best_ref_offset = shfl_offset; + } + } + + if (lane_id == 0) { + out_scores[global_warp_id] = max_score; + out_best_pos[global_warp_id] = ref_start + best_ref_offset; + } +} + +int main() { + mcFree(0); + + std::string ref_file = "reference.fasta"; + std::string reads_file = "reads.fastq"; + std::ofstream out_file("results.txt"); + + std::vector ref_names; + std::vector ref_offsets; + std::string concat_ref; + + load_reference(ref_file, ref_names, concat_ref, ref_offsets); + + concat_ref.append(100, 'N'); + + std::vector reads; + load_reads(reads_file, reads); + + int num_reads = reads.size(); + + size_t ref_encoded_size = (concat_ref.length() + 15) / 16; + uint32_t *h_ref_packed = new uint32_t[ref_encoded_size]; + + encode_sequence_cpu(concat_ref, h_ref_packed); + + uint32_t *d_ref_packed; + mcMalloc(&d_ref_packed, ref_encoded_size * sizeof(uint32_t)); + + mcMemcpy(d_ref_packed, h_ref_packed, ref_encoded_size * sizeof(uint32_t), mcMemcpyHostToDevice); + + std::vector h_reads_packed(num_reads * (MAX_READ_LEN / 16), 0); + std::vector h_read_lengths(num_reads, 0); + std::vector kmer_list; + kmer_list.reserve(num_reads * 50); + + for (int i = 0; i < num_reads; ++i) { + int rlen = std::min((int)reads[i].seq.length(), MAX_READ_LEN); + h_read_lengths[i] = rlen; + encode_sequence_cpu(reads[i].seq, h_reads_packed.data() + i * (MAX_READ_LEN / 16)); + + for (int offset = 0; offset <= rlen - KMER_SIZE; offset += 2) { + kmer_list.push_back({pack_16mer(reads[i].seq.c_str() + offset), i, offset}); + } + } + __gnu_parallel::sort(kmer_list.begin(), kmer_list.end()); + + // 22-bit prefix lookup + std::vector lookup(LOOKUP_BUCKETS + 1, (uint32_t)kmer_list.size()); + uint32_t prefix = 0; + lookup[0] = 0; + for (size_t i = 0; i < kmer_list.size(); ++i) { + uint32_t p = kmer_list[i].kmer >> LOOKUP_SHIFT; + while (prefix < p) + lookup[++prefix] = (uint32_t)i; + } + while (prefix < LOOKUP_BUCKETS) + lookup[++prefix] = (uint32_t)kmer_list.size(); + + std::vector> read_cands(num_reads); + + // rolling k-mer scan on reference + if (concat_ref.length() >= (KMER_SIZE + 100)) { + const int64_t max_pos = (int64_t)concat_ref.length() - KMER_SIZE - 100; // inclusive + +#pragma omp parallel + { + std::vector> local_cands(num_reads); + + const int tid = omp_get_thread_num(); + const int nth = omp_get_num_threads(); + + const int64_t total = max_pos + 1; // pos in [0, max_pos] + const int64_t chunk = (total + nth - 1) / nth; + const int64_t begin = tid * chunk; + const int64_t end = std::min(total, begin + chunk); + + if (begin < end) { + uint32_t kmer = pack_16mer(concat_ref.data() + begin); + + for (int64_t pos = begin; pos < end; ++pos) { + uint32_t p = kmer >> LOOKUP_SHIFT; + + for (uint32_t i = lookup[p]; i < lookup[p + 1]; ++i) { + if (kmer_list[i].kmer == kmer) { + int64_t start = pos - kmer_list[i].offset - 50; + if (start >= 0) + local_cands[kmer_list[i].read_id].push_back(start); + } + } + + // roll to next 16-mer + if (pos + 1 < end) { + kmer = (kmer << 2) | (uint32_t)char2val_cpu(concat_ref[pos + KMER_SIZE]); + } + } + } + +#pragma omp critical + { + for (int i = 0; i < num_reads; ++i) { + read_cands[i].insert(read_cands[i].end(), local_cands[i].begin(), local_cands[i].end()); + } + } + } + } + + std::vector h_candidate_pos; + std::vector h_candidate_read_idx; + + for (int i = 0; i < num_reads; ++i) { + if (read_cands[i].empty()) + continue; + std::sort(read_cands[i].begin(), read_cands[i].end()); + int64_t last_added = -1000; + int count = 0; + for (int64_t pos : read_cands[i]) { + if (pos > last_added + 50) { + h_candidate_pos.push_back(pos); + h_candidate_read_idx.push_back(i); + last_added = pos; + if (++count >= 100) + break; + } + } + } + + int total_candidates = h_candidate_pos.size(); + if (total_candidates == 0) { + for (const auto &r : reads) + out_file << r.name << " unknown_origin\n"; + return 0; + } + + uint32_t *d_reads_packed; + int *d_read_lengths, *d_candidate_read_idx, *d_out_scores; + int64_t *d_candidate_pos, *d_out_best_pos; + + mcMalloc(&d_reads_packed, h_reads_packed.size() * sizeof(uint32_t)); + mcMalloc(&d_read_lengths, num_reads * sizeof(int)); + mcMalloc(&d_candidate_pos, total_candidates * sizeof(int64_t)); + mcMalloc(&d_candidate_read_idx, total_candidates * sizeof(int)); + mcMalloc(&d_out_scores, total_candidates * sizeof(int)); + mcMalloc(&d_out_best_pos, total_candidates * sizeof(int64_t)); + + mcMemcpy(d_reads_packed, h_reads_packed.data(), h_reads_packed.size() * sizeof(uint32_t), mcMemcpyHostToDevice); + mcMemcpy(d_read_lengths, h_read_lengths.data(), num_reads * sizeof(int), mcMemcpyHostToDevice); + mcMemcpy(d_candidate_pos, h_candidate_pos.data(), total_candidates * sizeof(int64_t), mcMemcpyHostToDevice); + mcMemcpy(d_candidate_read_idx, h_candidate_read_idx.data(), total_candidates * sizeof(int), mcMemcpyHostToDevice); + + int blockSize = 128; // 4 Warps per Block + // Total threads needed = total_candidates * 32 + int numBlocks = (total_candidates * 32 + blockSize - 1) / blockSize; + + int warps_per_block = blockSize / 32; + // Shared memory: warps_per_block * (MAX_READ_LEN * 2 arrays) * sizeof(short) + size_t sharedMemSize = warps_per_block * MAX_READ_LEN * 2 * sizeof(short); + + sw_extend_wavefront_kernel<<>>( + d_ref_packed, concat_ref.length() - 100, d_reads_packed, d_read_lengths, d_candidate_pos, d_candidate_read_idx, + total_candidates, d_out_scores, d_out_best_pos); + + mcDeviceSynchronize(); + + std::vector h_out_scores(total_candidates); + std::vector h_out_best_pos(total_candidates); + mcMemcpy(h_out_scores.data(), d_out_scores, total_candidates * sizeof(int), mcMemcpyDeviceToHost); + mcMemcpy(h_out_best_pos.data(), d_out_best_pos, total_candidates * sizeof(int64_t), mcMemcpyDeviceToHost); + + std::vector best_score_per_read(num_reads, -1); + std::vector best_pos_per_read(num_reads, -1); + + for (int i = 0; i < total_candidates; ++i) { + int r_idx = h_candidate_read_idx[i]; + if (h_out_scores[i] > best_score_per_read[r_idx]) { + best_score_per_read[r_idx] = h_out_scores[i]; + best_pos_per_read[r_idx] = h_out_best_pos[i]; + } + } + + for (int i = 0; i < num_reads; ++i) { + if (best_score_per_read[i] > h_read_lengths[i] * 0.5) { + int64_t global_pos = best_pos_per_read[i]; + auto it = std::upper_bound(ref_offsets.begin(), ref_offsets.end(), global_pos); + int ref_idx = std::distance(ref_offsets.begin(), it) - 1; + int64_t local_pos = global_pos - ref_offsets[ref_idx]; + + out_file << reads[i].name << " " << ref_names[ref_idx] << " " << local_pos << "\n"; + } else { + out_file << reads[i].name << " unknown_origin\n"; + } + } + + mcFree(d_ref_packed); + mcFree(d_reads_packed); + mcFree(d_read_lengths); + mcFree(d_candidate_pos); + mcFree(d_candidate_read_idx); + mcFree(d_out_scores); + mcFree(d_out_best_pos); + delete[] h_ref_packed; + + return 0; +} diff --git "a/05_seq_align/\346\236\227\344\271\220\345\244\251/main.mu" "b/05_seq_align/\346\236\227\344\271\220\345\244\251/main.mu" new file mode 100644 index 0000000..81e0b22 --- /dev/null +++ "b/05_seq_align/\346\236\227\344\271\220\345\244\251/main.mu" @@ -0,0 +1,575 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define MATCH_SCORE 2 +#define MISMATCH_SCORE -1 +#define GAP_PENALTY -1 +#define MAX_READ_LEN 256 +#define KMER_SIZE 16 +constexpr int LOOKUP_PREFIX_BITS = 22; +constexpr uint32_t LOOKUP_BUCKETS = (1u << LOOKUP_PREFIX_BITS); +constexpr int LOOKUP_SHIFT = 32 - LOOKUP_PREFIX_BITS; + +struct Read { + std::string name; + std::string seq; +}; + +// Modified to hold read kmers instead of reference kmers +struct KmerPos { + uint32_t kmer; + int read_id; + int offset; + bool operator<(const KmerPos &other) const { return kmer < other.kmer; } +}; + +void load_reference(const std::string &filename, std::vector &names, std::string &concat_seq, + std::vector &offsets) { + int fd = open(filename.c_str(), O_RDONLY); + if (fd == -1) + exit(1); + + struct stat sb; + if (fstat(fd, &sb) == -1) + exit(1); + size_t size = sb.st_size; + + const char *mapped_ptr = static_cast(mmap(NULL, size, PROT_READ, MAP_PRIVATE, fd, 0)); + if (mapped_ptr == MAP_FAILED) + exit(1); + close(fd); + + concat_seq.reserve(size); + + const char *ptr = mapped_ptr; + const char *end = ptr + size; + bool is_fastq = (ptr < end && *ptr == '@'); + + while (ptr < end) { + if (is_fastq) { + const char *name_end = (const char *)memchr(ptr, '\n', end - ptr); + if (!name_end) + name_end = end; + std::string name(ptr + 1, name_end - (ptr + 1)); + if (!name.empty() && name.back() == '\r') + name.pop_back(); + ptr = name_end + (name_end < end ? 1 : 0); + + if (ptr >= end) + break; + const char *seq_end = (const char *)memchr(ptr, '\n', end - ptr); + if (!seq_end) + seq_end = end; + std::string seq(ptr, seq_end - ptr); + if (!seq.empty() && seq.back() == '\r') + seq.pop_back(); + ptr = seq_end + (seq_end < end ? 1 : 0); + + names.push_back(name); + offsets.push_back(concat_seq.size()); + concat_seq += seq; + + ptr = (const char *)memchr(ptr, '\n', end - ptr); + if (ptr) + ptr++; + else + break; + ptr = (const char *)memchr(ptr, '\n', end - ptr); + if (ptr) + ptr++; + else + break; + } else { + if (*ptr == '>') { + const char *name_end = (const char *)memchr(ptr, '\n', end - ptr); + if (!name_end) + name_end = end; + std::string name(ptr + 1, name_end - (ptr + 1)); + if (!name.empty() && name.back() == '\r') + name.pop_back(); + names.push_back(name); + offsets.push_back(concat_seq.size()); + ptr = name_end + (name_end < end ? 1 : 0); + } else { + const char *seq_end = (const char *)memchr(ptr, '\n', end - ptr); + if (!seq_end) + seq_end = end; + size_t seq_len = seq_end - ptr; + if (seq_len > 0 && *(seq_end - 1) == '\r') + seq_len--; + concat_seq.append(ptr, seq_len); + ptr = seq_end + (seq_end < end ? 1 : 0); + } + } + } + munmap((void *)mapped_ptr, size); +} + +void load_reads(const std::string &filename, std::vector &reads) { + int fd = open(filename.c_str(), O_RDONLY); + if (fd == -1) + exit(1); + + struct stat sb; + if (fstat(fd, &sb) == -1) + exit(1); + size_t size = sb.st_size; + + const char *mapped_ptr = static_cast(mmap(NULL, size, PROT_READ, MAP_PRIVATE, fd, 0)); + if (mapped_ptr == MAP_FAILED) + exit(1); + close(fd); + + reads.reserve(size / 150); + + const char *ptr = mapped_ptr; + const char *end = ptr + size; + + while (ptr < end) { + if (*ptr != '@') { + ptr = (const char *)memchr(ptr, '\n', end - ptr); + if (ptr) + ptr++; + else + break; + continue; + } + + Read r; + const char *name_end = (const char *)memchr(ptr, '\n', end - ptr); + if (!name_end) + name_end = end; + r.name.assign(ptr + 1, name_end - (ptr + 1)); + if (!r.name.empty() && r.name.back() == '\r') + r.name.pop_back(); + ptr = name_end + (name_end < end ? 1 : 0); + + if (ptr >= end) + break; + + const char *seq_end = (const char *)memchr(ptr, '\n', end - ptr); + if (!seq_end) + seq_end = end; + r.seq.assign(ptr, seq_end - ptr); + if (!r.seq.empty() && r.seq.back() == '\r') + r.seq.pop_back(); + ptr = seq_end + (seq_end < end ? 1 : 0); + + ptr = (const char *)memchr(ptr, '\n', end - ptr); + if (ptr) + ptr++; + else + break; + ptr = (const char *)memchr(ptr, '\n', end - ptr); + if (ptr) + ptr++; + else + break; + + reads.push_back(std::move(r)); + } + munmap((void *)mapped_ptr, size); +} + +inline uint8_t char2val_cpu(char c) { return (c >> 1) & 0x03; } + +inline uint32_t pack_16mer(const char *seq) { + uint32_t packed = 0; + for (int i = 0; i < 16; ++i) { + uint8_t val = char2val_cpu(seq[i]); + packed |= (val << (30 - 2 * i)); + } + return packed; +} + +void encode_sequence_cpu(const std::string &seq, uint32_t *buffer) { + size_t length = seq.length(); + size_t encoded_size = (length + 15) / 16; + + // 小任务串行,避免 OMP 开销 + if (encoded_size < 4096) { + for (size_t i = 0; i < encoded_size; ++i) { + uint32_t packed = 0; + size_t start = i * 16; + for (int j = 0; j < 16; ++j) { + if (start + j < length) { + uint8_t val = char2val_cpu(seq[start + j]); + packed |= (uint32_t)(val << (30 - 2 * j)); + } + } + buffer[i] = packed; + } + return; + } + +#pragma omp parallel for + for (size_t i = 0; i < encoded_size; ++i) { + uint32_t packed = 0; + size_t start = i * 16; + for (int j = 0; j < 16; ++j) { + if (start + j < length) { + uint8_t val = char2val_cpu(seq[start + j]); + packed |= (uint32_t)(val << (30 - 2 * j)); + } + } + buffer[i] = packed; + } +} + +__device__ inline uint8_t get_base(const uint32_t *packed_array, size_t index) { + size_t array_idx = index / 16; + int bit_pos = 30 - 2 * (index % 16); + return (packed_array[array_idx] >> bit_pos) & 0x03; +} + +extern "C" __global__ void sw_extend_wavefront_kernel(const uint32_t *ref_packed, size_t ref_len, + const uint32_t *reads_packed, const int *read_lengths, + const int64_t *candidate_positions, + const int *candidate_read_indices, int total_candidates, + int *out_scores, int64_t *out_best_pos) { + int global_warp_id = (blockIdx.x * blockDim.x + threadIdx.x) / 32; + int local_warp_id = threadIdx.x / 32; + int lane_id = threadIdx.x % 32; + + if (global_warp_id >= total_candidates) + return; + + extern __shared__ short shared_buf[]; + + int warps_per_block = blockDim.x / 32; + short *H_row = &shared_buf[local_warp_id * MAX_READ_LEN]; + short *Start_row = &shared_buf[local_warp_id * MAX_READ_LEN + (MAX_READ_LEN * warps_per_block)]; + + int64_t ref_start = candidate_positions[global_warp_id]; + int read_idx = candidate_read_indices[global_warp_id]; + int read_len = read_lengths[read_idx]; + + const uint32_t *my_read_packed = reads_packed + read_idx * (MAX_READ_LEN / 16); + + int ref_region_len = read_len + 100; + if (ref_start + ref_region_len > ref_len) { + ref_region_len = ref_len - ref_start; + } + + int total_chunks = (read_len + 31) / 32; + + for (int chunk = 0; chunk < total_chunks; ++chunk) { + int j = chunk * 32 + lane_id; + if (j < read_len) { + H_row[j] = 0; + Start_row[j] = -1; + } + } + + __syncwarp(); + + short max_score = 0; + short best_ref_offset = 0; + + short h_diag_reg[8] = {0}; + short start_diag_reg[8] = {0}; + + short next_h_reg[8] = {0}; + short next_start_reg[8] = {-1}; + + int total_steps = ref_region_len + read_len - 1; + + for (int step = 0; step < total_steps; ++step) { + + for (int chunk = 0; chunk < total_chunks; ++chunk) { + int j = chunk * 32 + lane_id; + int i = step - j; + + if (i >= 0 && i < ref_region_len && j < read_len) { + int64_t current_ref_pos = ref_start + i; + uint8_t ref_base = get_base(ref_packed, current_ref_pos); + uint8_t read_base = get_base(my_read_packed, j); + + short match = (ref_base == read_base) ? MATCH_SCORE : MISMATCH_SCORE; + + short h_up = H_row[j]; + short start_up = Start_row[j]; + + short h_left = (j > 0) ? H_row[j - 1] : 0; + short start_left = (j > 0) ? Start_row[j - 1] : -1; + + short score_diag = h_diag_reg[chunk] + match; + short score_up = h_up + GAP_PENALTY; + short score_left = h_left + GAP_PENALTY; + + short score = 0; + short start = i; + + if (score_diag > 0 && score_diag >= score_up && score_diag >= score_left) { + score = score_diag; + start = (h_diag_reg[chunk] == 0) ? i : start_diag_reg[chunk]; + } else if (score_up > 0 && score_up >= score_left) { + score = score_up; + start = (h_up == 0) ? i : start_up; + } else if (score_left > 0) { + score = score_left; + start = (h_left == 0) ? i : start_left; + } + + // 【修复核心】将当前步的左侧值,作为下一步的对角线值缓存! + h_diag_reg[chunk] = h_left; + start_diag_reg[chunk] = start_left; + + next_h_reg[chunk] = score; + next_start_reg[chunk] = start; + + if (score > max_score) { + max_score = score; + best_ref_offset = start; + } + } + } + + __syncwarp(); + + for (int chunk = 0; chunk < total_chunks; ++chunk) { + int j = chunk * 32 + lane_id; + int i = step - j; + if (i >= 0 && i < ref_region_len && j < read_len) { + H_row[j] = next_h_reg[chunk]; + Start_row[j] = next_start_reg[chunk]; + } + } + + __syncwarp(); + } + + for (int offset = 16; offset > 0; offset /= 2) { + short shfl_score = __shfl_down_sync(0xffffffff, max_score, offset); + short shfl_offset = __shfl_down_sync(0xffffffff, best_ref_offset, offset); + if (shfl_score > max_score) { + max_score = shfl_score; + best_ref_offset = shfl_offset; + } + } + + if (lane_id == 0) { + out_scores[global_warp_id] = max_score; + out_best_pos[global_warp_id] = ref_start + best_ref_offset; + } +} + +int main() { + musaFree(0); + + std::string ref_file = "reference.fasta"; + std::string reads_file = "reads.fastq"; + std::ofstream out_file("results.txt"); + + std::vector ref_names; + std::vector ref_offsets; + std::string concat_ref; + + load_reference(ref_file, ref_names, concat_ref, ref_offsets); + + concat_ref.append(100, 'N'); + + std::vector reads; + load_reads(reads_file, reads); + + int num_reads = reads.size(); + + size_t ref_encoded_size = (concat_ref.length() + 15) / 16; + uint32_t *h_ref_packed = new uint32_t[ref_encoded_size]; + + encode_sequence_cpu(concat_ref, h_ref_packed); + + uint32_t *d_ref_packed; + musaMalloc(&d_ref_packed, ref_encoded_size * sizeof(uint32_t)); + + + musaMemcpy(d_ref_packed, h_ref_packed, ref_encoded_size * sizeof(uint32_t), musaMemcpyHostToDevice); + + + std::vector h_reads_packed(num_reads * (MAX_READ_LEN / 16), 0); + std::vector h_read_lengths(num_reads, 0); + std::vector kmer_list; + kmer_list.reserve(num_reads * 50); + + for (int i = 0; i < num_reads; ++i) { + int rlen = std::min((int)reads[i].seq.length(), MAX_READ_LEN); + h_read_lengths[i] = rlen; + encode_sequence_cpu(reads[i].seq, h_reads_packed.data() + i * (MAX_READ_LEN / 16)); + + for (int offset = 0; offset <= rlen - KMER_SIZE; offset += 2) { + kmer_list.push_back({pack_16mer(reads[i].seq.c_str() + offset), i, offset}); + } + } + __gnu_parallel::sort(kmer_list.begin(), kmer_list.end()); + + // 22-bit prefix lookup + std::vector lookup(LOOKUP_BUCKETS + 1, (uint32_t)kmer_list.size()); + uint32_t prefix = 0; + lookup[0] = 0; + for (size_t i = 0; i < kmer_list.size(); ++i) { + uint32_t p = kmer_list[i].kmer >> LOOKUP_SHIFT; + while (prefix < p) + lookup[++prefix] = (uint32_t)i; + } + while (prefix < LOOKUP_BUCKETS) + lookup[++prefix] = (uint32_t)kmer_list.size(); + + std::vector> read_cands(num_reads); + + // rolling k-mer scan on reference + if (concat_ref.length() >= (KMER_SIZE + 100)) { + const int64_t max_pos = (int64_t)concat_ref.length() - KMER_SIZE - 100; // inclusive + +#pragma omp parallel + { + std::vector> local_cands(num_reads); + + const int tid = omp_get_thread_num(); + const int nth = omp_get_num_threads(); + + const int64_t total = max_pos + 1; // pos in [0, max_pos] + const int64_t chunk = (total + nth - 1) / nth; + const int64_t begin = tid * chunk; + const int64_t end = std::min(total, begin + chunk); + + if (begin < end) { + uint32_t kmer = pack_16mer(concat_ref.data() + begin); + + for (int64_t pos = begin; pos < end; ++pos) { + uint32_t p = kmer >> LOOKUP_SHIFT; + + for (uint32_t i = lookup[p]; i < lookup[p + 1]; ++i) { + if (kmer_list[i].kmer == kmer) { + int64_t start = pos - kmer_list[i].offset - 50; + if (start >= 0) + local_cands[kmer_list[i].read_id].push_back(start); + } + } + + // roll to next 16-mer + if (pos + 1 < end) { + kmer = (kmer << 2) | (uint32_t)char2val_cpu(concat_ref[pos + KMER_SIZE]); + } + } + } + +#pragma omp critical + { + for (int i = 0; i < num_reads; ++i) { + read_cands[i].insert(read_cands[i].end(), local_cands[i].begin(), local_cands[i].end()); + } + } + } + } + + std::vector h_candidate_pos; + std::vector h_candidate_read_idx; + + for (int i = 0; i < num_reads; ++i) { + if (read_cands[i].empty()) + continue; + std::sort(read_cands[i].begin(), read_cands[i].end()); + int64_t last_added = -1000; + int count = 0; + for (int64_t pos : read_cands[i]) { + if (pos > last_added + 50) { + h_candidate_pos.push_back(pos); + h_candidate_read_idx.push_back(i); + last_added = pos; + if (++count >= 100) + break; + } + } + } + + int total_candidates = h_candidate_pos.size(); + if (total_candidates == 0) { + for (const auto &r : reads) + out_file << r.name << " unknown_origin\n"; + return 0; + } + + uint32_t *d_reads_packed; + int *d_read_lengths, *d_candidate_read_idx, *d_out_scores; + int64_t *d_candidate_pos, *d_out_best_pos; + + musaMalloc(&d_reads_packed, h_reads_packed.size() * sizeof(uint32_t)); + musaMalloc(&d_read_lengths, num_reads * sizeof(int)); + musaMalloc(&d_candidate_pos, total_candidates * sizeof(int64_t)); + musaMalloc(&d_candidate_read_idx, total_candidates * sizeof(int)); + musaMalloc(&d_out_scores, total_candidates * sizeof(int)); + musaMalloc(&d_out_best_pos, total_candidates * sizeof(int64_t)); + + musaMemcpy(d_reads_packed, h_reads_packed.data(), h_reads_packed.size() * sizeof(uint32_t), musaMemcpyHostToDevice); + musaMemcpy(d_read_lengths, h_read_lengths.data(), num_reads * sizeof(int), musaMemcpyHostToDevice); + musaMemcpy(d_candidate_pos, h_candidate_pos.data(), total_candidates * sizeof(int64_t), musaMemcpyHostToDevice); + musaMemcpy(d_candidate_read_idx, h_candidate_read_idx.data(), total_candidates * sizeof(int), + musaMemcpyHostToDevice); + + int blockSize = 128; // 4 Warps per Block + // Total threads needed = total_candidates * 32 + int numBlocks = (total_candidates * 32 + blockSize - 1) / blockSize; + + int warps_per_block = blockSize / 32; + // Shared memory: warps_per_block * (MAX_READ_LEN * 2 arrays) * sizeof(short) + size_t sharedMemSize = warps_per_block * MAX_READ_LEN * 2 * sizeof(short); + + sw_extend_wavefront_kernel<<>>( + d_ref_packed, concat_ref.length() - 100, d_reads_packed, d_read_lengths, d_candidate_pos, d_candidate_read_idx, + total_candidates, d_out_scores, d_out_best_pos); + + musaDeviceSynchronize(); + + std::vector h_out_scores(total_candidates); + std::vector h_out_best_pos(total_candidates); + musaMemcpy(h_out_scores.data(), d_out_scores, total_candidates * sizeof(int), musaMemcpyDeviceToHost); + musaMemcpy(h_out_best_pos.data(), d_out_best_pos, total_candidates * sizeof(int64_t), musaMemcpyDeviceToHost); + + std::vector best_score_per_read(num_reads, -1); + std::vector best_pos_per_read(num_reads, -1); + + for (int i = 0; i < total_candidates; ++i) { + int r_idx = h_candidate_read_idx[i]; + if (h_out_scores[i] > best_score_per_read[r_idx]) { + best_score_per_read[r_idx] = h_out_scores[i]; + best_pos_per_read[r_idx] = h_out_best_pos[i]; + } + } + + for (int i = 0; i < num_reads; ++i) { + if (best_score_per_read[i] > h_read_lengths[i] * 0.5) { + int64_t global_pos = best_pos_per_read[i]; + auto it = std::upper_bound(ref_offsets.begin(), ref_offsets.end(), global_pos); + int ref_idx = std::distance(ref_offsets.begin(), it) - 1; + int64_t local_pos = global_pos - ref_offsets[ref_idx]; + + out_file << reads[i].name << " " << ref_names[ref_idx] << " " << local_pos << "\n"; + } else { + out_file << reads[i].name << " unknown_origin\n"; + } + } + + musaFree(d_ref_packed); + musaFree(d_reads_packed); + musaFree(d_read_lengths); + musaFree(d_candidate_pos); + musaFree(d_candidate_read_idx); + musaFree(d_out_scores); + musaFree(d_out_best_pos); + delete[] h_ref_packed; + + return 0; +}