Coverage for src/optwps/optwps.py: 84%

201 statements  

« prev     ^ index     » next       coverage.py v7.11.1, created at 2025-11-10 19:23 +0100

1#!/usr/bin/env python 

2""" 

3Fast Window Protection Score (WPS) Calculator for Cell-Free DNA Analysis. 

4 

5This module provides efficient computation of Window Protection Scores from BAM files, 

6designed for analyzing cell-free DNA fragmentation patterns and nucleosome positioning. 

7 

8The WPS algorithm identifies protected genomic regions by analyzing DNA fragment 

9coverage patterns. It counts fragments that span versus fragments whose endpoints 

10fall within a protection window around each genomic position. 

11 

12Example: 

13 Basic usage:: 

14 

15 from optwps import WPS 

16 

17 wps_calc = WPS(protection_size=120) 

18 wps_calc.run(bamfile='input.bam', out_filepath='output.tsv') 

19 

20 Creating separate output files:: 

21 

22 # Per chromosome 

23 wps_calc.run(bamfile='input.bam', out_filepath='wps_{chrom}.tsv') 

24 

25 # Per target region 

26 wps_calc.run(bamfile='input.bam', out_filepath='wps_{target}.tsv.gz') 

27 

28Note: 

29 - Op BAM Description 

30 - M 0 alignment match (can be a sequence match or mismatch) 

31 - I 1 insertion to the reference 

32 - D 2 deletion from the reference 

33 - N 3 skipped region from the reference 

34 - S 4 soft clipping (clipped sequences present in SEQ) 

35 - H 5 hard clipping (clipped sequences NOT present in SEQ) 

36 - P 6 padding (silent deletion from padded reference) 

37 - = 7 sequence match 

38 - X 8 sequence mismatch 

39 - pysam always uses 0-based coordinates, converting from 1-based of the sam files! 

40""" 

41 

42import os 

43import sys 

44import pysam 

45import random 

46import numpy as np 

47import pandas as pd 

48 

49from optwps.utils import exopen, is_soft_clipped, ref_aln_length 

50from tqdm.auto import tqdm 

51 

52 

53class ROIGenerator: 

54 """ 

55 Generate regions of interest (ROI) for processing. 

56 

57 This class generates genomic regions either from a BED file or from the entire 

58 genome in the BAM file. Regions are yielded in chunks for memory-efficient processing. 

59 

60 Args: 

61 bed_file (str, optional): Path to BED file containing regions to process. 

62 If None, the entire genome will be processed. Default: None 

63 chunk_size (int, optional): Size of chunks in base pairs for processing. 

64 Default: 1e8 (100 megabases) 

65 """ 

66 

67 def __init__(self, bed_file=None, chunk_size=1e8): 

68 self.bed_file = bed_file 

69 self.chunk_size = chunk_size 

70 

71 def regions(self, bam_file=None): 

72 """ 

73 Generate regions for processing. 

74 

75 Yields genomic regions either from a BED file or from the entire genome 

76 referenced in the BAM file. Regions are chunked according to chunk_size. 

77 

78 Args: 

79 bam_file (str, optional): Path to BAM file. Required if no BED file 

80 is provided to determine genome regions. Default: None 

81 

82 Yields: 

83 tuple: (chromosome, chunk_start, chunk_end, region_start, region_end) 

84 where chunk_start/chunk_end define the current chunk being processed 

85 and region_start/region_end define the full original region boundaries 

86 

87 Raises: 

88 ValueError: If neither bed_file nor bam_file can provide regions 

89 """ 

90 if (self.bed_file is None) or (not os.path.exists(self.bed_file)): 

91 input_file = pysam.Samfile(bam_file, "rb") 

92 nchunks = sum( 

93 (input_file.get_reference_length(chrom) - 1) // self.chunk_size + 1 

94 for chrom in input_file.references 

95 ) 

96 iterator = tqdm(total=nchunks, desc="Processing genome regions") 

97 for chrom in input_file.references: 

98 chrom_length = input_file.get_reference_length(chrom) 

99 region_start = 0 

100 while region_start < chrom_length: 

101 region_end = min(region_start + self.chunk_size, chrom_length) 

102 yield chrom, region_start, region_end, 0, chrom_length 

103 region_start = region_end 

104 iterator.update(1) 

105 iterator.close() 

106 else: 

107 # read number of lines in bed file 

108 nlines = sum(1 for _ in exopen(self.bed_file, "r")) 

109 with exopen(self.bed_file, "r") as bed: 

110 for line in tqdm(bed, total=nlines, desc="Processing BED regions"): 

111 chrom, start, end = line.split()[:3] 

112 chunk_start = int(start) 

113 chunk_end = min(chunk_start + self.chunk_size, int(end)) 

114 while chunk_start < int(end): 

115 yield chrom, chunk_start, chunk_end, int(start), int(end) 

116 chunk_start = chunk_end 

117 chunk_end = min(chunk_start + self.chunk_size, int(end)) 

118 

119 

120class CMWriterUnpacker: 

121 """ 

122 Wrapper for file handles to provide consistent write and close interface. 

123 

124 This class handles both context managers and direct file handles, providing 

125 a unified interface for writing data and closing files. It also prevents 

126 closing stdout to avoid unexpected behavior. 

127 

128 Args: 

129 cm: A context manager or file handle object 

130 

131 Attributes: 

132 cm: The original context manager 

133 handle: The unpacked file handle 

134 """ 

135 

136 def __init__(self, cm): 

137 self.cm = cm 

138 try: 

139 self.handle = self.cm.__enter__() 

140 except AttributeError: 

141 self.handle = self.cm 

142 

143 def write(self, data): 

144 """Write data to the file handle.""" 

145 self.handle.write(data) 

146 

147 def close(self): 

148 """Close the file handle, avoiding closure of stdout.""" 

149 try: 

150 if self.handle != sys.stdout: 

151 self.handle.close() 

152 except AttributeError: 

153 self.cm.__exit__(None, None, None) 

154 

155 

156class WPS: 

157 """ 

158 Window Protection Score (WPS) calculator for cell-free DNA analysis. 

159 

160 This class computes Window Protection Scores from aligned sequencing reads in BAM format. 

161 WPS quantifies the protection of DNA fragments around each genomic position, useful for 

162 identifying nucleosome positioning and other protected regions in cell-free DNA. 

163 

164 The algorithm calculates: 

165 - Outside score: fragments that completely span the protection window 

166 - Inside score: fragment endpoints falling within the protection window 

167 - WPS = outside - inside 

168 

169 Args: 

170 bed_file (str, optional): Path to BED file with regions to process. 

171 If None, processes entire genome. Default: None 

172 protection_size (int, optional): Total protection window size in base pairs. 

173 This value is divided by 2 to get the window on each side of the position. 

174 Default: 120 

175 min_insert_size (int, optional): Minimum insert/fragment size to include. 

176 If None, no minimum filter applied. Default: None 

177 max_insert_size (int, optional): Maximum insert/fragment size to include. 

178 If None, no maximum filter applied. Default: None 

179 valid_chroms (set, optional): Set of valid chromosome names to process. 

180 Default: chromosomes 1-22, X, Y 

181 chunk_size (float, optional): Region chunk size for processing. 

182 Default: 1e8 (100 Mb) 

183 

184 Attributes: 

185 bed_file (str): Path to BED file or None 

186 protection_size (int): Half of the protection window size 

187 valid_chroms (set): Set of valid chromosome names 

188 min_insert_size (int): Minimum fragment size filter 

189 max_insert_size (int): Maximum fragment size filter 

190 chunk_size (float): Chunk size for processing 

191 roi_generator (ROIGenerator): Region generator instance 

192 

193 Example: 

194 >>> wps = WPS(protection_size=120, min_insert_size=120, max_insert_size=180) 

195 >>> wps.run(bamfile='sample.bam', out_filepath='wps_output.tsv') 

196 >>> # Process specific chromosomes 

197 >>> wps = WPS(valid_chroms={'1', '2', '3', 'X', 'Y'}) 

198 >>> wps.run(bamfile='sample.bam', out_filepath='chr_wps.tsv') 

199 

200 Note: 

201 - Automatically filters duplicate, QC-failed, and unmapped reads 

202 - Handles both paired-end and single-end sequencing data 

203 - Supports downsampling for high-coverage samples 

204 - Can write to separate files per chromosome or target region using placeholders 

205 """ 

206 

207 def __init__( 

208 self, 

209 bed_file=None, 

210 protection_size=120, 

211 min_insert_size=None, 

212 max_insert_size=None, 

213 valid_chroms=set(map(str, list(range(1, 23)) + ["X", "Y"])), 

214 chunk_size=1e8, 

215 ): 

216 self.bed_file = bed_file 

217 self.protection_size = protection_size // 2 

218 if valid_chroms is not None: 218 ↛ 221line 218 didn't jump to line 221 because the condition on line 218 was always true

219 self.valid_chroms = [x.replace("chr", "") for x in valid_chroms] 

220 else: 

221 self.valid_chroms = None 

222 self.min_insert_size = min_insert_size 

223 self.max_insert_size = max_insert_size 

224 self.chunk_size = chunk_size 

225 self.roi_generator = ROIGenerator(bed_file=self.bed_file) 

226 

227 def __call__(self, *args, **kwargs): 

228 return self.run(*args, **kwargs) 

229 

230 def run( 

231 self, 

232 bamfile, 

233 out_filepath=None, 

234 downsample_ratio=None, 

235 compute_coverage=False, 

236 verbose_output=False, 

237 ): 

238 """ 

239 Calculate Window Protection Score for all regions and write to file. 

240 

241 Processes the BAM file to compute WPS values for each genomic position 

242 in the specified regions (or entire genome). Results are written to a 

243 tab-separated output file. 

244 

245 Args: 

246 bamfile (str): Path to input BAM file (must be sorted and indexed) 

247 out_filepath (str): Path to output TSV file, or stdout (default). 

248 If it contains a formatting substring {target} or {chrom}, it will be used to create per-target or 

249 per-chromosome files. Both can be used together. 

250 downsample_ratio (float, optional): Fraction of reads to randomly keep 

251 (0.0 to 1.0). Useful for high-coverage samples. Default: None (no downsampling) 

252 compute_coverage (bool, optional): Whether to compute and include base coverage 

253 verbose_output (bool, optional): Whether to include detailed counts 

254 

255 Returns: 

256 None: Results are written directly to out_filepath 

257 

258 Output Format: 

259 Tab-separated file with columns: 

260 - chromosome: Chromosome name (without 'chr' prefix) 

261 - start: Start position (0-based) 

262 - end: End position (1-based, start + 1) 

263 - base read coverage (if compute_coverage=True) 

264 - outside: Count of fragments spanning the protection window (if verbose_output=True) 

265 - inside: Count of fragment endpoints in protection window (if verbose_output=True) 

266 - wps: Window Protection Score (outside - inside) 

267 

268 Raises: 

269 FileNotFoundError: If bamfile does not exist 

270 ValueError: If downsample_ratio is not between 0 and 1 

271 

272 Example: 

273 >>> wps = WPS() 

274 >>> wps.run(bamfile='input.bam', out_filepath='output.tsv') 

275 >>> # With downsampling 

276 >>> wps.run(bamfile='input.bam', out_filepath='output.tsv', 

277 ... downsample_ratio=0.5) 

278 >>> # Creating separate files per chromosome 

279 >>> wps.run(bamfile='input.bam', out_filepath='wps_{chrom}.tsv') 

280 >>> # Creating separate files per target region 

281 >>> wps.run(bamfile='input.bam', out_filepath='wps_{target}.tsv.gz') 

282 """ 

283 if out_filepath is None: 

284 out_filepath = "stdout" 

285 input_file = pysam.Samfile(bamfile, "rb") 

286 prefix = ( 

287 "chr" if any(r.startswith("chr") for r in input_file.references) else "" 

288 ) 

289 use_partial_writer = "{target}" in out_filepath or "{chrom}" in out_filepath 

290 partial_writers = dict() 

291 total_outfile = None 

292 if not use_partial_writer: 

293 total_outfile = CMWriterUnpacker(exopen(out_filepath, "w")) 

294 try: 

295 total_outfile = total_outfile.__enter__() 

296 except AttributeError: 

297 pass 

298 

299 for chrom, start, end, total_start, total_end in self.roi_generator.regions( 

300 bam_file=bamfile 

301 ): 

302 if "chr" in chrom: 302 ↛ 304line 302 didn't jump to line 304 because the condition on line 302 was always true

303 chrom = chrom.replace("chr", "") 

304 if self.valid_chroms is not None and chrom not in self.valid_chroms: 

305 continue 

306 try: 

307 regionStart, regionEnd = int(start), int(end) 

308 except ValueError: 

309 continue 

310 

311 starts = [] 

312 ends = [] 

313 for read in input_file.fetch( 

314 prefix + chrom, 

315 max(0, regionStart - self.protection_size - 1), 

316 regionEnd + self.protection_size + 1, 

317 ): 

318 if read.is_duplicate or read.is_qcfail or read.is_unmapped: 318 ↛ 319line 318 didn't jump to line 319 because the condition on line 318 was never true

319 continue 

320 if is_soft_clipped(read.cigartuples): 320 ↛ 321line 320 didn't jump to line 321 because the condition on line 320 was never true

321 continue 

322 

323 if read.is_paired: 

324 if read.mate_is_unmapped: 324 ↛ 325line 324 didn't jump to line 325 because the condition on line 324 was never true

325 continue 

326 if read.rnext != read.tid: 326 ↛ 327line 326 didn't jump to line 327 because the condition on line 326 was never true

327 continue 

328 if read.is_read1 or ( 

329 read.is_read2 

330 and read.pnext + read.qlen 

331 < regionStart - self.protection_size - 1 

332 ): 

333 if read.isize == 0: 333 ↛ 334line 333 didn't jump to line 334 because the condition on line 333 was never true

334 continue 

335 if ( 

336 downsample_ratio is not None 

337 and random.random() >= downsample_ratio 

338 ): 

339 continue 

340 lseq = abs(read.isize) 

341 if ( 

342 self.min_insert_size is not None 

343 and lseq < self.min_insert_size 

344 ): 

345 continue 

346 if ( 

347 self.max_insert_size is not None 

348 and lseq > self.max_insert_size 

349 ): 

350 continue 

351 rstart = min(read.pos, read.pnext) 

352 rend = rstart + lseq - 1 

353 starts.append(rstart) 

354 ends.append(rend) 

355 else: 

356 if ( 

357 downsample_ratio is not None 

358 and random.random() >= downsample_ratio 

359 ): 

360 continue 

361 rstart = read.pos 

362 lseq = ref_aln_length(read.cigartuples) 

363 if self.min_insert_size is not None and ( 

364 (lseq < self.min_insert_size) or (lseq > self.max_insert_size) 

365 ): 

366 continue 

367 rend = rstart + lseq - 1 # end included 

368 starts.append(rstart) 

369 ends.append(rend) 

370 n = regionEnd - regionStart + 1 

371 if len(starts) > 0: 

372 starts = np.array(starts) 

373 ends = np.array(ends) 

374 # Fragments fully spanning the window boundaries 

375 span_start = starts + self.protection_size - regionStart 

376 span_end = ends - self.protection_size - regionStart + 2 

377 valid = span_end >= span_start 

378 outside = np.zeros(n + 2, dtype=int) 

379 np.add.at( 

380 outside, 

381 np.clip(span_start[valid] + 1, 0, n + 1), 

382 1, 

383 ) 

384 np.add.at( 

385 outside, 

386 np.clip(span_end[valid], 0, n + 1), 

387 -1, 

388 ) 

389 np.add.at( 

390 outside, 

391 np.clip(span_start[~valid] + 1, 0, n + 1), 

392 -1, 

393 ) 

394 np.add.at( 

395 outside, 

396 np.clip(span_end[~valid], 0, n + 1), 

397 1, 

398 ) 

399 

400 outside_cum = np.cumsum(outside)[:-2] 

401 

402 # Fragments whose endpoints fall inside windows 

403 all_ends = np.concatenate([starts, ends]) - regionStart 

404 left = np.clip(all_ends - self.protection_size + 2, 0, n + 1) 

405 right = np.clip(all_ends + self.protection_size + 1, 0, n + 1) 

406 inside = np.zeros(n + 2, dtype=int) 

407 np.add.at(inside, left, 1) 

408 np.add.at(inside, right, -1) 

409 inside_cum = np.cumsum(inside)[:-2] 

410 

411 wps = outside_cum - inside_cum 

412 coverage = None 

413 if compute_coverage: 

414 coverage = np.zeros(n + 1, dtype=int) 

415 np.add.at( 

416 coverage, 

417 np.clip(starts - regionStart, 0, n), 

418 1, 

419 ) 

420 np.add.at( 

421 coverage, 

422 np.clip(ends - regionStart + 1, 0, n), 

423 -1, 

424 ) 

425 coverage = np.cumsum(coverage)[:-1] 

426 else: 

427 outside_cum = np.zeros(n, dtype=int) 

428 inside_cum = np.zeros(n, dtype=int) 

429 wps = np.zeros(n, dtype=int) 

430 coverage = None 

431 if compute_coverage: 431 ↛ 432line 431 didn't jump to line 432 because the condition on line 431 was never true

432 coverage = np.zeros(n, dtype=int) 

433 partial_outfile = None 

434 if use_partial_writer: 

435 formatted_out_filepath = out_filepath.format( 

436 target=f"{chrom}_{total_start}_{total_end}", 

437 chrom=chrom, 

438 ) 

439 partial_outfile = partial_writers.get( 

440 formatted_out_filepath, 

441 CMWriterUnpacker( 

442 exopen(formatted_out_filepath, "w", use_pigz=False) 

443 ), 

444 ) 

445 partial_writers[formatted_out_filepath] = partial_outfile 

446 try: 

447 partial_outfile = partial_outfile.__enter__() 

448 except AttributeError: 

449 pass 

450 outfile = partial_outfile if use_partial_writer else total_outfile 

451 st = np.arange(regionStart, regionEnd + 1) 

452 en = st + 1 # add end coordinate 

453 df = pd.DataFrame({"#chrom": chrom, "start": st, "end": en}) 

454 if compute_coverage: 

455 df["coverage"] = coverage 

456 if verbose_output: 

457 df["outside"] = outside_cum 

458 df["inside"] = inside_cum 

459 df["wps"] = wps 

460 df.to_csv(outfile, sep="\t", header=False, index=False) 

461 

462 if use_partial_writer: 

463 for writer in partial_writers.values(): 

464 writer.close() 

465 else: 

466 total_outfile.close() 

467 

468 

469def main(): 

470 """Main entry point for the command-line interface.""" 

471 from argparse import ArgumentParser 

472 

473 parser = ArgumentParser() 

474 parser.add_argument( 

475 "-i", 

476 "--input", 

477 dest="input", 

478 help="Input BAM file", 

479 required=True, 

480 ) 

481 parser.add_argument( 

482 "-o", 

483 "--output", 

484 dest="output", 

485 help="The output file path for WPS results. If not provided, results will be printed to stdout.", 

486 required=False, 

487 ) 

488 parser.add_argument( 

489 "-r", 

490 "--regions", 

491 dest="regions", 

492 help="BED file with regions of interest (default: whole genome)", 

493 default=None, 

494 ) 

495 parser.add_argument( 

496 "-w", 

497 "--protection", 

498 dest="protection", 

499 help="Base pair protection window (default: 120)", 

500 default=120, 

501 type=int, 

502 ) 

503 parser.add_argument( 

504 "--min-insert-size", 

505 dest="min_insert_size", 

506 help="Minimum read length threshold to consider (Optional)", 

507 default=None, 

508 type=int, 

509 ) 

510 parser.add_argument( 

511 "--max-insert-size", 

512 dest="max_insert_size", 

513 help="Minimum read length threshold to consider (Optional)", 

514 default=None, 

515 type=int, 

516 ) 

517 parser.add_argument( 

518 "--downsample", 

519 dest="downsample", 

520 help="Ratio to down sample reads (default OFF)", 

521 default=None, 

522 type=float, 

523 ) 

524 parser.add_argument( 

525 "--chunk-size", 

526 dest="chunk_size", 

527 help="Chunk size for processing in pieces, in case of low memory (default 1e8)", 

528 default=1e8, 

529 type=int, 

530 ) 

531 parser.add_argument( 

532 "--valid-chroms", 

533 dest="valid_chroms", 

534 help="Comma-separated list of valid chromosomes to include (e.g., '1,2,3,X,Y') or 'canonical' for chromosomes 1-22, X, Y. Optional", 

535 default=None, 

536 type=str, 

537 ) 

538 parser.add_argument( 

539 "--verbose-output", 

540 dest="verbose_output", 

541 help="If provided, output will include separate counts for 'outside' and 'inside' along with WPS.", 

542 action="store_true", 

543 ) 

544 args = parser.parse_args() 

545 valid_chroms = None 

546 if args.valid_chroms == "canonical": 

547 valid_chroms = [str(i) for i in range(1, 23)] + ["X", "Y"] 

548 else: 

549 valid_chroms = args.valid_chroms.split(",") if args.valid_chroms else None 

550 optwps = WPS( 

551 bed_file=args.regions, 

552 protection_size=args.protection, 

553 min_insert_size=args.min_insert_size, 

554 max_insert_size=args.max_insert_size, 

555 chunk_size=args.chunk_size, 

556 valid_chroms=valid_chroms, 

557 ) 

558 optwps.run( 

559 bamfile=args.input, 

560 out_filepath=args.output, 

561 downsample_ratio=args.downsample, 

562 verbose_output=args.verbose_output, 

563 ) 

564 

565 

566if __name__ == "__main__": 566 ↛ 567line 566 didn't jump to line 567 because the condition on line 566 was never true

567 main()