Coverage for src/optwps/optwps.py: 81%
147 statements
« prev ^ index » next coverage.py v7.11.1, created at 2025-11-09 23:08 +0100
« prev ^ index » next coverage.py v7.11.1, created at 2025-11-09 23:08 +0100
1#!/usr/bin/env python
2"""
3Fast Window Protection Score (WPS) Calculator for Cell-Free DNA Analysis.
5This module provides efficient computation of Window Protection Scores from BAM files,
6designed for analyzing cell-free DNA fragmentation patterns and nucleosome positioning.
8The WPS algorithm identifies protected genomic regions by analyzing DNA fragment
9coverage patterns. It counts fragments that span versus fragments whose endpoints
10fall within a protection window around each genomic position.
12Example:
13 Basic usage::
15 from optwps import WPS
17 wps_calc = WPS(protection_size=120)
18 wps_calc.run(bamfile='input.bam', out_filepath='output.tsv')
20Note:
21 - Op BAM Description
22 - M 0 alignment match (can be a sequence match or mismatch)
23 - I 1 insertion to the reference
24 - D 2 deletion from the reference
25 - N 3 skipped region from the reference
26 - S 4 soft clipping (clipped sequences present in SEQ)
27 - H 5 hard clipping (clipped sequences NOT present in SEQ)
28 - P 6 padding (silent deletion from padded reference)
29 - = 7 sequence match
30 - X 8 sequence mismatch
31 - pysam always uses 0-based coordinates, converting from 1-based of the sam files!
32"""
34import os
35import pysam
36import random
37import numpy as np
39from src.optwps.utils import exopen, is_soft_clipped, ref_aln_length
40from tqdm.auto import tqdm
43class ROIGenerator:
44 """
45 Generate regions of interest (ROI) for processing.
47 This class generates genomic regions either from a BED file or from the entire
48 genome in the BAM file. Regions are yielded in chunks for memory-efficient processing.
50 Args:
51 bed_file (str, optional): Path to BED file containing regions to process.
52 If None, the entire genome will be processed. Default: None
53 chunk_size (int, optional): Size of chunks in base pairs for processing.
54 Default: 1e8 (100 megabases)
55 """
57 def __init__(self, bed_file=None, chunk_size=1e8):
58 self.bed_file = bed_file
59 self.chunk_size = chunk_size
61 def regions(self, bam_file=None):
62 """
63 Generate regions for processing.
65 Yields genomic regions either from a BED file or from the entire genome
66 referenced in the BAM file. Regions are chunked according to chunk_size.
68 Args:
69 bam_file (str, optional): Path to BAM file. Required if no BED file
70 is provided to determine genome regions. Default: None
72 Yields:
73 tuple: (chromosome, start, end) for each region chunk
75 Raises:
76 ValueError: If neither bed_file nor bam_file can provide regions
77 """
78 if (self.bed_file is None) or (not os.path.exists(self.bed_file)):
79 input_file = pysam.Samfile(bam_file, "rb")
80 nchunks = sum(
81 (input_file.get_reference_length(chrom) - 1) // self.chunk_size + 1
82 for chrom in input_file.references
83 )
84 iterator = tqdm(total=nchunks, desc="Processing genome regions")
85 for chrom in input_file.references:
86 chrom_length = input_file.get_reference_length(chrom)
87 region_start = 0
88 while region_start < chrom_length:
89 region_end = min(region_start + self.chunk_size, chrom_length)
90 yield chrom, region_start, region_end
91 region_start = region_end
92 iterator.update(1)
93 iterator.close()
94 else:
95 # read number of lines in bed file
96 nlines = sum(1 for _ in exopen(self.bed_file, "r"))
97 with exopen(self.bed_file, "r") as bed:
98 for line in tqdm(bed, total=nlines, desc="Processing BED regions"):
99 chrom, start, end = line.split()[:3]
100 chunk_start = int(start)
101 chunk_end = min(chunk_start + self.chunk_size, int(end))
102 while chunk_start < int(end):
103 yield chrom, chunk_start, chunk_end
104 chunk_start = chunk_end
105 chunk_end = min(chunk_start + self.chunk_size, int(end))
108class WPS:
109 """
110 Window Protection Score (WPS) calculator for cell-free DNA analysis.
112 This class computes Window Protection Scores from aligned sequencing reads in BAM format.
113 WPS quantifies the protection of DNA fragments around each genomic position, useful for
114 identifying nucleosome positioning and other protected regions in cell-free DNA.
116 The algorithm calculates:
117 - Outside score: fragments that completely span the protection window
118 - Inside score: fragment endpoints falling within the protection window
119 - WPS = outside - inside
121 Args:
122 bed_file (str, optional): Path to BED file with regions to process.
123 If None, processes entire genome. Default: None
124 protection_size (int, optional): Total protection window size in base pairs.
125 This value is divided by 2 to get the window on each side of the position.
126 Default: 120
127 min_insert_size (int, optional): Minimum insert/fragment size to include.
128 If None, no minimum filter applied. Default: None
129 max_insert_size (int, optional): Maximum insert/fragment size to include.
130 If None, no maximum filter applied. Default: None
131 valid_chroms (set, optional): Set of valid chromosome names to process.
132 Default: chromosomes 1-22, X, Y
133 chunk_size (float, optional): Region chunk size for processing.
134 Default: 1e8 (100 Mb)
136 Attributes:
137 bed_file (str): Path to BED file or None
138 protection_size (int): Half of the protection window size
139 valid_chroms (set): Set of valid chromosome names
140 min_insert_size (int): Minimum fragment size filter
141 max_insert_size (int): Maximum fragment size filter
142 chunk_size (float): Chunk size for processing
143 roi_generator (ROIGenerator): Region generator instance
145 Example:
146 >>> wps = WPS(protection_size=120, min_insert_size=120, max_insert_size=180)
147 >>> wps.run(bamfile='sample.bam', out_filepath='wps_output.tsv')
149 Note:
150 - Automatically filters duplicate, QC-failed, and unmapped reads
151 - Handles both paired-end and single-end sequencing data
152 - Supports downsampling for high-coverage samples
153 """
155 def __init__(
156 self,
157 bed_file=None,
158 protection_size=120,
159 min_insert_size=None,
160 max_insert_size=None,
161 valid_chroms=set(map(str, list(range(1, 23)) + ["X", "Y"])),
162 chunk_size=1e8,
163 ):
164 self.bed_file = bed_file
165 self.protection_size = protection_size // 2
166 if valid_chroms is not None: 166 ↛ 169line 166 didn't jump to line 169 because the condition on line 166 was always true
167 self.valid_chroms = [x.replace("chr", "") for x in valid_chroms]
168 else:
169 self.valid_chroms = None
170 self.min_insert_size = min_insert_size
171 self.max_insert_size = max_insert_size
172 self.chunk_size = chunk_size
173 self.roi_generator = ROIGenerator(bed_file=self.bed_file)
175 def __call__(self, *args, **kwargs):
176 return self.run(*args, **kwargs)
178 def run(
179 self,
180 bamfile,
181 out_filepath=None,
182 downsample_ratio=None,
183 verbose_output=False,
184 ):
185 """
186 Calculate Window Protection Score for all regions and write to file.
188 Processes the BAM file to compute WPS values for each genomic position
189 in the specified regions (or entire genome). Results are written to a
190 tab-separated output file.
192 Args:
193 bamfile (str): Path to input BAM file (must be sorted and indexed)
194 out_filepath (str): Path to output TSV file, or stdout (default)
195 downsample_ratio (float, optional): Fraction of reads to randomly keep
196 (0.0 to 1.0). Useful for high-coverage samples. Default: None (no downsampling)
198 Returns:
199 None: Results are written directly to out_filepath
201 Output Format:
202 Tab-separated file with columns:
203 1. chromosome: Chromosome name (without 'chr' prefix)
204 2. start: Start position (0-based)
205 3. end: End position (1-based, start + 1)
206 4. outside: Count of fragments spanning the protection window (if verbose_output)
207 5. inside: Count of fragment endpoints in protection window (if verbose_output)
208 6. wps: Window Protection Score (outside - inside)
210 Raises:
211 FileNotFoundError: If bamfile does not exist
212 ValueError: If downsample_ratio is not between 0 and 1
214 Example:
215 >>> wps = WPS()
216 >>> wps.run(bamfile='input.bam', out_filepath='output.tsv')
217 >>> # With downsampling
218 >>> wps.run(bamfile='input.bam', out_filepath='output.tsv',
219 ... downsample_ratio=0.5)
220 """
221 if out_filepath is None:
222 out_filepath = "stdout"
223 input_file = pysam.Samfile(bamfile, "rb")
224 prefix = (
225 "chr" if any(r.startswith("chr") for r in input_file.references) else ""
226 )
227 with exopen(out_filepath, "w") as outfile:
229 for chrom, start, end in self.roi_generator.regions(bam_file=bamfile):
230 if "chr" in chrom: 230 ↛ 232line 230 didn't jump to line 232 because the condition on line 230 was always true
231 chrom = chrom.replace("chr", "")
232 if self.valid_chroms is not None and chrom not in self.valid_chroms:
233 continue
234 try:
235 regionStart, regionEnd = int(start), int(end)
236 except ValueError:
237 continue
239 starts = []
240 ends = []
241 for read in input_file.fetch(
242 prefix + chrom,
243 max(0, regionStart - self.protection_size - 1),
244 regionEnd + self.protection_size + 1,
245 ):
246 if read.is_duplicate or read.is_qcfail or read.is_unmapped: 246 ↛ 247line 246 didn't jump to line 247 because the condition on line 246 was never true
247 continue
248 if is_soft_clipped(read.cigartuples): 248 ↛ 249line 248 didn't jump to line 249 because the condition on line 248 was never true
249 continue
251 if read.is_paired:
252 if read.mate_is_unmapped: 252 ↛ 253line 252 didn't jump to line 253 because the condition on line 252 was never true
253 continue
254 if read.rnext != read.tid: 254 ↛ 255line 254 didn't jump to line 255 because the condition on line 254 was never true
255 continue
256 if read.is_read1 or (
257 read.is_read2
258 and read.pnext + read.qlen
259 < regionStart - self.protection_size - 1
260 ):
261 if read.isize == 0: 261 ↛ 262line 261 didn't jump to line 262 because the condition on line 261 was never true
262 continue
263 if (
264 downsample_ratio is not None
265 and random.random() >= downsample_ratio
266 ):
267 continue
268 lseq = abs(read.isize)
269 if (
270 self.min_insert_size is not None
271 and lseq < self.min_insert_size
272 ):
273 continue
274 if (
275 self.max_insert_size is not None
276 and lseq > self.max_insert_size
277 ):
278 continue
279 rstart = min(read.pos, read.pnext)
280 rend = rstart + lseq - 1
281 starts.append(rstart)
282 ends.append(rend)
283 else:
284 if (
285 downsample_ratio is not None
286 and random.random() >= downsample_ratio
287 ):
288 continue
289 rstart = read.pos
290 lseq = ref_aln_length(read.cigartuples)
291 if self.min_insert_size is not None and (
292 (lseq < self.min_insert_size)
293 or (lseq > self.max_insert_size)
294 ):
295 continue
296 rend = rstart + lseq - 1 # end included
297 starts.append(rstart)
298 ends.append(rend)
299 n = regionEnd - regionStart + 1
300 if len(starts) > 0:
301 starts = np.array(starts)
302 ends = np.array(ends)
304 # Fragments fully spanning the window boundaries
305 span_start = starts + self.protection_size - regionStart
306 span_end = ends - self.protection_size - regionStart + 2
307 valid = span_end >= span_start
308 outside = np.zeros(n + 2, dtype=int)
309 np.add.at(
310 outside,
311 np.clip(span_start[valid] + 1, 0, n + 1),
312 1,
313 )
314 np.add.at(
315 outside,
316 np.clip(span_end[valid], 0, n + 1),
317 -1,
318 )
319 np.add.at(
320 outside,
321 np.clip(span_start[~valid] + 1, 0, n + 1),
322 -1,
323 )
324 np.add.at(
325 outside,
326 np.clip(span_end[~valid], 0, n + 1),
327 1,
328 )
330 outside_cum = np.cumsum(outside)[:-1]
332 # Fragments whose endpoints fall inside windows
333 all_ends = np.concatenate([starts, ends]) - regionStart
334 left = np.clip(all_ends - self.protection_size + 2, 0, n + 1)
335 right = np.clip(all_ends + self.protection_size + 1, 0, n + 1)
336 inside = np.zeros(n + 2, dtype=int)
337 np.add.at(inside, left, 1)
338 np.add.at(inside, right, -1)
339 inside_cum = np.cumsum(inside)[:-1]
341 wps = outside_cum - inside_cum
342 else:
343 outside_cum = np.zeros(n, dtype=int)
344 inside_cum = np.zeros(n, dtype=int)
345 wps = np.zeros(n, dtype=int)
346 for i in range(regionEnd - regionStart + 1):
347 if verbose_output:
348 outfile.write(
349 "%s\t%d\t%d\t%d\t%d\t%d\n"
350 % (
351 chrom,
352 regionStart + i,
353 regionStart + i + 1, # add end coordinate
354 outside_cum[i],
355 inside_cum[i],
356 wps[i],
357 )
358 )
359 else:
360 outfile.write(
361 "%s\t%d\t%d\t%d\n"
362 % (
363 chrom,
364 regionStart + i,
365 regionStart + i + 1, # add end coordinate
366 wps[i],
367 )
368 )
371def main():
372 """Main entry point for the command-line interface."""
373 from argparse import ArgumentParser
375 parser = ArgumentParser()
376 parser.add_argument(
377 "-i",
378 "--input",
379 dest="input",
380 help="Input BAM file",
381 required=True,
382 )
383 parser.add_argument(
384 "-o",
385 "--outfile",
386 dest="outfile",
387 help="The output file path for WPS results. If not provided, results will be printed to stdout.",
388 required=False,
389 )
390 parser.add_argument(
391 "-r",
392 "--regions",
393 dest="regions",
394 help="BED file with regions of interest (default: whole genome)",
395 default=None,
396 )
397 parser.add_argument(
398 "-w",
399 "--protection",
400 dest="protection",
401 help="Base pair protection window (default: 120)",
402 default=120,
403 type=int,
404 )
405 parser.add_argument(
406 "--min-insert-size",
407 dest="min_insert_size",
408 help="Minimum read length threshold to consider (Optional)",
409 default=None,
410 type=int,
411 )
412 parser.add_argument(
413 "--max-insert-size",
414 dest="max_insert_size",
415 help="Minimum read length threshold to consider (Optional)",
416 default=None,
417 type=int,
418 )
419 parser.add_argument(
420 "--downsample",
421 dest="downsample",
422 help="Ratio to down sample reads (default OFF)",
423 default=None,
424 type=float,
425 )
426 parser.add_argument(
427 "--chunk-size",
428 dest="chunk_size",
429 help="Chunk size for processing in pieces, in case of low memory (default 1e8)",
430 default=1e8,
431 type=int,
432 )
433 parser.add_argument(
434 "--valid-chroms",
435 dest="valid_chroms",
436 help="Comma-separated list of valid chromosomes to include (e.g., '1,2,3,X,Y') or 'canonical' for chromosomes 1-22, X, Y. Optional",
437 default=None,
438 type=str,
439 )
440 parser.add_argument(
441 "--verbose-output",
442 dest="verbose_output",
443 help="If provided, output will include separate counts for 'outside' and 'inside' along with WPS.",
444 action="store_true",
445 )
446 args = parser.parse_args()
447 valid_chroms = None
448 if args.valid_chroms == "canonical":
449 valid_chroms = [str(i) for i in range(1, 23)] + ["X", "Y"]
450 else:
451 valid_chroms = args.valid_chroms.split(",") if args.valid_chroms else None
452 optwps = WPS(
453 bed_file=args.regions,
454 protection_size=args.protection,
455 min_insert_size=args.min_insert_size,
456 max_insert_size=args.max_insert_size,
457 chunk_size=args.chunk_size,
458 valid_chroms=valid_chroms,
459 )
460 optwps.run(
461 bamfile=args.input,
462 out_filepath=args.outfile,
463 downsample_ratio=args.downsample,
464 verbose_output=args.verbose_output,
465 )
468if __name__ == "__main__": 468 ↛ 469line 468 didn't jump to line 469 because the condition on line 468 was never true
469 main()