Coverage for src/optwps/optwps.py: 84%
201 statements
« prev ^ index » next coverage.py v7.11.1, created at 2025-11-10 19:23 +0100
« prev ^ index » next coverage.py v7.11.1, created at 2025-11-10 19:23 +0100
1#!/usr/bin/env python
2"""
3Fast Window Protection Score (WPS) Calculator for Cell-Free DNA Analysis.
5This module provides efficient computation of Window Protection Scores from BAM files,
6designed for analyzing cell-free DNA fragmentation patterns and nucleosome positioning.
8The WPS algorithm identifies protected genomic regions by analyzing DNA fragment
9coverage patterns. It counts fragments that span versus fragments whose endpoints
10fall within a protection window around each genomic position.
12Example:
13 Basic usage::
15 from optwps import WPS
17 wps_calc = WPS(protection_size=120)
18 wps_calc.run(bamfile='input.bam', out_filepath='output.tsv')
20 Creating separate output files::
22 # Per chromosome
23 wps_calc.run(bamfile='input.bam', out_filepath='wps_{chrom}.tsv')
25 # Per target region
26 wps_calc.run(bamfile='input.bam', out_filepath='wps_{target}.tsv.gz')
28Note:
29 - Op BAM Description
30 - M 0 alignment match (can be a sequence match or mismatch)
31 - I 1 insertion to the reference
32 - D 2 deletion from the reference
33 - N 3 skipped region from the reference
34 - S 4 soft clipping (clipped sequences present in SEQ)
35 - H 5 hard clipping (clipped sequences NOT present in SEQ)
36 - P 6 padding (silent deletion from padded reference)
37 - = 7 sequence match
38 - X 8 sequence mismatch
39 - pysam always uses 0-based coordinates, converting from 1-based of the sam files!
40"""
42import os
43import sys
44import pysam
45import random
46import numpy as np
47import pandas as pd
49from optwps.utils import exopen, is_soft_clipped, ref_aln_length
50from tqdm.auto import tqdm
53class ROIGenerator:
54 """
55 Generate regions of interest (ROI) for processing.
57 This class generates genomic regions either from a BED file or from the entire
58 genome in the BAM file. Regions are yielded in chunks for memory-efficient processing.
60 Args:
61 bed_file (str, optional): Path to BED file containing regions to process.
62 If None, the entire genome will be processed. Default: None
63 chunk_size (int, optional): Size of chunks in base pairs for processing.
64 Default: 1e8 (100 megabases)
65 """
67 def __init__(self, bed_file=None, chunk_size=1e8):
68 self.bed_file = bed_file
69 self.chunk_size = chunk_size
71 def regions(self, bam_file=None):
72 """
73 Generate regions for processing.
75 Yields genomic regions either from a BED file or from the entire genome
76 referenced in the BAM file. Regions are chunked according to chunk_size.
78 Args:
79 bam_file (str, optional): Path to BAM file. Required if no BED file
80 is provided to determine genome regions. Default: None
82 Yields:
83 tuple: (chromosome, chunk_start, chunk_end, region_start, region_end)
84 where chunk_start/chunk_end define the current chunk being processed
85 and region_start/region_end define the full original region boundaries
87 Raises:
88 ValueError: If neither bed_file nor bam_file can provide regions
89 """
90 if (self.bed_file is None) or (not os.path.exists(self.bed_file)):
91 input_file = pysam.Samfile(bam_file, "rb")
92 nchunks = sum(
93 (input_file.get_reference_length(chrom) - 1) // self.chunk_size + 1
94 for chrom in input_file.references
95 )
96 iterator = tqdm(total=nchunks, desc="Processing genome regions")
97 for chrom in input_file.references:
98 chrom_length = input_file.get_reference_length(chrom)
99 region_start = 0
100 while region_start < chrom_length:
101 region_end = min(region_start + self.chunk_size, chrom_length)
102 yield chrom, region_start, region_end, 0, chrom_length
103 region_start = region_end
104 iterator.update(1)
105 iterator.close()
106 else:
107 # read number of lines in bed file
108 nlines = sum(1 for _ in exopen(self.bed_file, "r"))
109 with exopen(self.bed_file, "r") as bed:
110 for line in tqdm(bed, total=nlines, desc="Processing BED regions"):
111 chrom, start, end = line.split()[:3]
112 chunk_start = int(start)
113 chunk_end = min(chunk_start + self.chunk_size, int(end))
114 while chunk_start < int(end):
115 yield chrom, chunk_start, chunk_end, int(start), int(end)
116 chunk_start = chunk_end
117 chunk_end = min(chunk_start + self.chunk_size, int(end))
120class CMWriterUnpacker:
121 """
122 Wrapper for file handles to provide consistent write and close interface.
124 This class handles both context managers and direct file handles, providing
125 a unified interface for writing data and closing files. It also prevents
126 closing stdout to avoid unexpected behavior.
128 Args:
129 cm: A context manager or file handle object
131 Attributes:
132 cm: The original context manager
133 handle: The unpacked file handle
134 """
136 def __init__(self, cm):
137 self.cm = cm
138 try:
139 self.handle = self.cm.__enter__()
140 except AttributeError:
141 self.handle = self.cm
143 def write(self, data):
144 """Write data to the file handle."""
145 self.handle.write(data)
147 def close(self):
148 """Close the file handle, avoiding closure of stdout."""
149 try:
150 if self.handle != sys.stdout:
151 self.handle.close()
152 except AttributeError:
153 self.cm.__exit__(None, None, None)
156class WPS:
157 """
158 Window Protection Score (WPS) calculator for cell-free DNA analysis.
160 This class computes Window Protection Scores from aligned sequencing reads in BAM format.
161 WPS quantifies the protection of DNA fragments around each genomic position, useful for
162 identifying nucleosome positioning and other protected regions in cell-free DNA.
164 The algorithm calculates:
165 - Outside score: fragments that completely span the protection window
166 - Inside score: fragment endpoints falling within the protection window
167 - WPS = outside - inside
169 Args:
170 bed_file (str, optional): Path to BED file with regions to process.
171 If None, processes entire genome. Default: None
172 protection_size (int, optional): Total protection window size in base pairs.
173 This value is divided by 2 to get the window on each side of the position.
174 Default: 120
175 min_insert_size (int, optional): Minimum insert/fragment size to include.
176 If None, no minimum filter applied. Default: None
177 max_insert_size (int, optional): Maximum insert/fragment size to include.
178 If None, no maximum filter applied. Default: None
179 valid_chroms (set, optional): Set of valid chromosome names to process.
180 Default: chromosomes 1-22, X, Y
181 chunk_size (float, optional): Region chunk size for processing.
182 Default: 1e8 (100 Mb)
184 Attributes:
185 bed_file (str): Path to BED file or None
186 protection_size (int): Half of the protection window size
187 valid_chroms (set): Set of valid chromosome names
188 min_insert_size (int): Minimum fragment size filter
189 max_insert_size (int): Maximum fragment size filter
190 chunk_size (float): Chunk size for processing
191 roi_generator (ROIGenerator): Region generator instance
193 Example:
194 >>> wps = WPS(protection_size=120, min_insert_size=120, max_insert_size=180)
195 >>> wps.run(bamfile='sample.bam', out_filepath='wps_output.tsv')
196 >>> # Process specific chromosomes
197 >>> wps = WPS(valid_chroms={'1', '2', '3', 'X', 'Y'})
198 >>> wps.run(bamfile='sample.bam', out_filepath='chr_wps.tsv')
200 Note:
201 - Automatically filters duplicate, QC-failed, and unmapped reads
202 - Handles both paired-end and single-end sequencing data
203 - Supports downsampling for high-coverage samples
204 - Can write to separate files per chromosome or target region using placeholders
205 """
207 def __init__(
208 self,
209 bed_file=None,
210 protection_size=120,
211 min_insert_size=None,
212 max_insert_size=None,
213 valid_chroms=set(map(str, list(range(1, 23)) + ["X", "Y"])),
214 chunk_size=1e8,
215 ):
216 self.bed_file = bed_file
217 self.protection_size = protection_size // 2
218 if valid_chroms is not None: 218 ↛ 221line 218 didn't jump to line 221 because the condition on line 218 was always true
219 self.valid_chroms = [x.replace("chr", "") for x in valid_chroms]
220 else:
221 self.valid_chroms = None
222 self.min_insert_size = min_insert_size
223 self.max_insert_size = max_insert_size
224 self.chunk_size = chunk_size
225 self.roi_generator = ROIGenerator(bed_file=self.bed_file)
227 def __call__(self, *args, **kwargs):
228 return self.run(*args, **kwargs)
230 def run(
231 self,
232 bamfile,
233 out_filepath=None,
234 downsample_ratio=None,
235 compute_coverage=False,
236 verbose_output=False,
237 ):
238 """
239 Calculate Window Protection Score for all regions and write to file.
241 Processes the BAM file to compute WPS values for each genomic position
242 in the specified regions (or entire genome). Results are written to a
243 tab-separated output file.
245 Args:
246 bamfile (str): Path to input BAM file (must be sorted and indexed)
247 out_filepath (str): Path to output TSV file, or stdout (default).
248 If it contains a formatting substring {target} or {chrom}, it will be used to create per-target or
249 per-chromosome files. Both can be used together.
250 downsample_ratio (float, optional): Fraction of reads to randomly keep
251 (0.0 to 1.0). Useful for high-coverage samples. Default: None (no downsampling)
252 compute_coverage (bool, optional): Whether to compute and include base coverage
253 verbose_output (bool, optional): Whether to include detailed counts
255 Returns:
256 None: Results are written directly to out_filepath
258 Output Format:
259 Tab-separated file with columns:
260 - chromosome: Chromosome name (without 'chr' prefix)
261 - start: Start position (0-based)
262 - end: End position (1-based, start + 1)
263 - base read coverage (if compute_coverage=True)
264 - outside: Count of fragments spanning the protection window (if verbose_output=True)
265 - inside: Count of fragment endpoints in protection window (if verbose_output=True)
266 - wps: Window Protection Score (outside - inside)
268 Raises:
269 FileNotFoundError: If bamfile does not exist
270 ValueError: If downsample_ratio is not between 0 and 1
272 Example:
273 >>> wps = WPS()
274 >>> wps.run(bamfile='input.bam', out_filepath='output.tsv')
275 >>> # With downsampling
276 >>> wps.run(bamfile='input.bam', out_filepath='output.tsv',
277 ... downsample_ratio=0.5)
278 >>> # Creating separate files per chromosome
279 >>> wps.run(bamfile='input.bam', out_filepath='wps_{chrom}.tsv')
280 >>> # Creating separate files per target region
281 >>> wps.run(bamfile='input.bam', out_filepath='wps_{target}.tsv.gz')
282 """
283 if out_filepath is None:
284 out_filepath = "stdout"
285 input_file = pysam.Samfile(bamfile, "rb")
286 prefix = (
287 "chr" if any(r.startswith("chr") for r in input_file.references) else ""
288 )
289 use_partial_writer = "{target}" in out_filepath or "{chrom}" in out_filepath
290 partial_writers = dict()
291 total_outfile = None
292 if not use_partial_writer:
293 total_outfile = CMWriterUnpacker(exopen(out_filepath, "w"))
294 try:
295 total_outfile = total_outfile.__enter__()
296 except AttributeError:
297 pass
299 for chrom, start, end, total_start, total_end in self.roi_generator.regions(
300 bam_file=bamfile
301 ):
302 if "chr" in chrom: 302 ↛ 304line 302 didn't jump to line 304 because the condition on line 302 was always true
303 chrom = chrom.replace("chr", "")
304 if self.valid_chroms is not None and chrom not in self.valid_chroms:
305 continue
306 try:
307 regionStart, regionEnd = int(start), int(end)
308 except ValueError:
309 continue
311 starts = []
312 ends = []
313 for read in input_file.fetch(
314 prefix + chrom,
315 max(0, regionStart - self.protection_size - 1),
316 regionEnd + self.protection_size + 1,
317 ):
318 if read.is_duplicate or read.is_qcfail or read.is_unmapped: 318 ↛ 319line 318 didn't jump to line 319 because the condition on line 318 was never true
319 continue
320 if is_soft_clipped(read.cigartuples): 320 ↛ 321line 320 didn't jump to line 321 because the condition on line 320 was never true
321 continue
323 if read.is_paired:
324 if read.mate_is_unmapped: 324 ↛ 325line 324 didn't jump to line 325 because the condition on line 324 was never true
325 continue
326 if read.rnext != read.tid: 326 ↛ 327line 326 didn't jump to line 327 because the condition on line 326 was never true
327 continue
328 if read.is_read1 or (
329 read.is_read2
330 and read.pnext + read.qlen
331 < regionStart - self.protection_size - 1
332 ):
333 if read.isize == 0: 333 ↛ 334line 333 didn't jump to line 334 because the condition on line 333 was never true
334 continue
335 if (
336 downsample_ratio is not None
337 and random.random() >= downsample_ratio
338 ):
339 continue
340 lseq = abs(read.isize)
341 if (
342 self.min_insert_size is not None
343 and lseq < self.min_insert_size
344 ):
345 continue
346 if (
347 self.max_insert_size is not None
348 and lseq > self.max_insert_size
349 ):
350 continue
351 rstart = min(read.pos, read.pnext)
352 rend = rstart + lseq - 1
353 starts.append(rstart)
354 ends.append(rend)
355 else:
356 if (
357 downsample_ratio is not None
358 and random.random() >= downsample_ratio
359 ):
360 continue
361 rstart = read.pos
362 lseq = ref_aln_length(read.cigartuples)
363 if self.min_insert_size is not None and (
364 (lseq < self.min_insert_size) or (lseq > self.max_insert_size)
365 ):
366 continue
367 rend = rstart + lseq - 1 # end included
368 starts.append(rstart)
369 ends.append(rend)
370 n = regionEnd - regionStart + 1
371 if len(starts) > 0:
372 starts = np.array(starts)
373 ends = np.array(ends)
374 # Fragments fully spanning the window boundaries
375 span_start = starts + self.protection_size - regionStart
376 span_end = ends - self.protection_size - regionStart + 2
377 valid = span_end >= span_start
378 outside = np.zeros(n + 2, dtype=int)
379 np.add.at(
380 outside,
381 np.clip(span_start[valid] + 1, 0, n + 1),
382 1,
383 )
384 np.add.at(
385 outside,
386 np.clip(span_end[valid], 0, n + 1),
387 -1,
388 )
389 np.add.at(
390 outside,
391 np.clip(span_start[~valid] + 1, 0, n + 1),
392 -1,
393 )
394 np.add.at(
395 outside,
396 np.clip(span_end[~valid], 0, n + 1),
397 1,
398 )
400 outside_cum = np.cumsum(outside)[:-2]
402 # Fragments whose endpoints fall inside windows
403 all_ends = np.concatenate([starts, ends]) - regionStart
404 left = np.clip(all_ends - self.protection_size + 2, 0, n + 1)
405 right = np.clip(all_ends + self.protection_size + 1, 0, n + 1)
406 inside = np.zeros(n + 2, dtype=int)
407 np.add.at(inside, left, 1)
408 np.add.at(inside, right, -1)
409 inside_cum = np.cumsum(inside)[:-2]
411 wps = outside_cum - inside_cum
412 coverage = None
413 if compute_coverage:
414 coverage = np.zeros(n + 1, dtype=int)
415 np.add.at(
416 coverage,
417 np.clip(starts - regionStart, 0, n),
418 1,
419 )
420 np.add.at(
421 coverage,
422 np.clip(ends - regionStart + 1, 0, n),
423 -1,
424 )
425 coverage = np.cumsum(coverage)[:-1]
426 else:
427 outside_cum = np.zeros(n, dtype=int)
428 inside_cum = np.zeros(n, dtype=int)
429 wps = np.zeros(n, dtype=int)
430 coverage = None
431 if compute_coverage: 431 ↛ 432line 431 didn't jump to line 432 because the condition on line 431 was never true
432 coverage = np.zeros(n, dtype=int)
433 partial_outfile = None
434 if use_partial_writer:
435 formatted_out_filepath = out_filepath.format(
436 target=f"{chrom}_{total_start}_{total_end}",
437 chrom=chrom,
438 )
439 partial_outfile = partial_writers.get(
440 formatted_out_filepath,
441 CMWriterUnpacker(
442 exopen(formatted_out_filepath, "w", use_pigz=False)
443 ),
444 )
445 partial_writers[formatted_out_filepath] = partial_outfile
446 try:
447 partial_outfile = partial_outfile.__enter__()
448 except AttributeError:
449 pass
450 outfile = partial_outfile if use_partial_writer else total_outfile
451 st = np.arange(regionStart, regionEnd + 1)
452 en = st + 1 # add end coordinate
453 df = pd.DataFrame({"#chrom": chrom, "start": st, "end": en})
454 if compute_coverage:
455 df["coverage"] = coverage
456 if verbose_output:
457 df["outside"] = outside_cum
458 df["inside"] = inside_cum
459 df["wps"] = wps
460 df.to_csv(outfile, sep="\t", header=False, index=False)
462 if use_partial_writer:
463 for writer in partial_writers.values():
464 writer.close()
465 else:
466 total_outfile.close()
469def main():
470 """Main entry point for the command-line interface."""
471 from argparse import ArgumentParser
473 parser = ArgumentParser()
474 parser.add_argument(
475 "-i",
476 "--input",
477 dest="input",
478 help="Input BAM file",
479 required=True,
480 )
481 parser.add_argument(
482 "-o",
483 "--output",
484 dest="output",
485 help="The output file path for WPS results. If not provided, results will be printed to stdout.",
486 required=False,
487 )
488 parser.add_argument(
489 "-r",
490 "--regions",
491 dest="regions",
492 help="BED file with regions of interest (default: whole genome)",
493 default=None,
494 )
495 parser.add_argument(
496 "-w",
497 "--protection",
498 dest="protection",
499 help="Base pair protection window (default: 120)",
500 default=120,
501 type=int,
502 )
503 parser.add_argument(
504 "--min-insert-size",
505 dest="min_insert_size",
506 help="Minimum read length threshold to consider (Optional)",
507 default=None,
508 type=int,
509 )
510 parser.add_argument(
511 "--max-insert-size",
512 dest="max_insert_size",
513 help="Minimum read length threshold to consider (Optional)",
514 default=None,
515 type=int,
516 )
517 parser.add_argument(
518 "--downsample",
519 dest="downsample",
520 help="Ratio to down sample reads (default OFF)",
521 default=None,
522 type=float,
523 )
524 parser.add_argument(
525 "--chunk-size",
526 dest="chunk_size",
527 help="Chunk size for processing in pieces, in case of low memory (default 1e8)",
528 default=1e8,
529 type=int,
530 )
531 parser.add_argument(
532 "--valid-chroms",
533 dest="valid_chroms",
534 help="Comma-separated list of valid chromosomes to include (e.g., '1,2,3,X,Y') or 'canonical' for chromosomes 1-22, X, Y. Optional",
535 default=None,
536 type=str,
537 )
538 parser.add_argument(
539 "--verbose-output",
540 dest="verbose_output",
541 help="If provided, output will include separate counts for 'outside' and 'inside' along with WPS.",
542 action="store_true",
543 )
544 args = parser.parse_args()
545 valid_chroms = None
546 if args.valid_chroms == "canonical":
547 valid_chroms = [str(i) for i in range(1, 23)] + ["X", "Y"]
548 else:
549 valid_chroms = args.valid_chroms.split(",") if args.valid_chroms else None
550 optwps = WPS(
551 bed_file=args.regions,
552 protection_size=args.protection,
553 min_insert_size=args.min_insert_size,
554 max_insert_size=args.max_insert_size,
555 chunk_size=args.chunk_size,
556 valid_chroms=valid_chroms,
557 )
558 optwps.run(
559 bamfile=args.input,
560 out_filepath=args.output,
561 downsample_ratio=args.downsample,
562 verbose_output=args.verbose_output,
563 )
566if __name__ == "__main__": 566 ↛ 567line 566 didn't jump to line 567 because the condition on line 566 was never true
567 main()