
Getting the edit distance from a bam alignment: a journey

A relevant parameter when looking at sequencing and alignment quality is the edit distance to the reference genome, roughly equivalent to the accuracy of the reads. (Roughly, because we ignore true variants between the sample and the reference.) The edit distance or Levenshtein distance is the minimum number of single-letter (nucleotide) edits, i.e. substitutions, insertions and deletions, that have to be made to one string (the read) for it to be equal to another string (the reference genome). Since the error profile of Nanopore sequencing is dominated by insertions and deletions, the edit distance isn't just the number of single nucleotide mismatches.
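Purely to make that definition concrete, here is a naive dynamic programming sketch of the Levenshtein distance; it is not used anywhere in the analysis below, since the aligner has already done this work for us.

def levenshtein(read, reference):
    '''
    Minimum number of substitutions, insertions and deletions
    needed to turn the read into the reference (naive dynamic programming).
    '''
    previous = list(range(len(reference) + 1))
    for i, readbase in enumerate(read, start=1):
        current = [i]
        for j, refbase in enumerate(reference, start=1):
            current.append(min(previous[j] + 1,      # extra base in the read (insertion)
                               current[j - 1] + 1,   # missing base in the read (deletion)
                               previous[j - 1] + (readbase != refbase)))  # match or substitution
        previous = current
    return previous[-1]


print(levenshtein("GATTTACA", "GATTACA"))  # 1: one extra T in the read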

This technology results in a wide range of read lengths, so I scale the edit distance to the length of the aligned fragment: longer reads shouldn't get penalised more than shorter reads. It's important to take the alignment length and not the read length, since the ends of reads can be clipped substantially by the aligner, sometimes by tens of kb. The experiments below were done using MinION data from the human Genome in a Bottle sample NA12878.

My scripts are in Python, so I'll add some code snippets to this post. For parsing bam files and extracting the relevant bits of information, I use pysam, which is pretty convenient and well-documented. These snippets are not the full script, but minimal examples to get the job done. A full example of the code is at the bottom of the post.
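As an aside on the read length versus alignment length distinction above, pysam exposes both on every record; this minimal sketch (alignment.bam is just a placeholder file name) prints the two values per read. Soft-clipped bases are included in query_length but not in query_alignment_length.

import pysam

samfile = pysam.AlignmentFile("alignment.bam", "rb")  # placeholder file name
for read in samfile.fetch():
    print(read.query_name,
          read.query_length,            # length of the read sequence stored in the bam, soft clips included
          read.query_alignment_length)  # length of the aligned part only, clipped ends excluded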

Getting this information from an alignment done using bwa mem is trivial, since bwa sets the NM bam tag, an integer containing precisely what I need: the edit distance. A function to extract the NM tag using a list comprehension is added below.


import pysam


def extractNMFromBam(bam):
    '''
    loop over a bam file and get the edit distance to the reference genome
    stored in the NM tag
    scale by aligned read length
    '''
    samfile = pysam.AlignmentFile(bam, "rb")
    return [read.get_tag("NM") / read.query_alignment_length for read in samfile.fetch()]


For aligners such as GraphMap this is less trivial, since the NM tag is not set. However, another tag comes to the rescue: the MD tag, which stores a string encoding the runs of matching nucleotides, the mismatched nucleotides and the deleted reference bases. Interesting information about the MD tag can be found here. I found it quite a tough representation to wrap my head around, which resulted in some wrong interpretations.
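To make that representation a bit more concrete, here is a minimal sketch that splits a made-up MD string into its components (decodeMD is just a helper for this post, not part of pysam):

import re


def decodeMD(md):
    '''
    Split an MD string into its components: runs of matching bases (numbers),
    mismatched reference bases (single letters) and deletions (prefixed with ^).
    '''
    return re.findall(r'(\d+|\^[ACGTN]+|[ACGTN])', md)


# 10 matches, a mismatch where the reference has an A, 5 matches,
# a deletion of the reference bases AC and 6 more matches:
print(decodeMD("10A5^AC6"))  # ['10', 'A', '5', '^AC', '6']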

My first naive implementation counted the number of matching nucleotides (the integers in the MD string) and subtracted those matches from the total alignment length to get the number of mismatched nucleotides. The list comprehension is quite long, but essentially I split the MD string on all occurrences of A, C, T, G or ^ (indicating a deletion) and sum the obtained integers. This sum is subtracted from the aligned read length and divided by the same.


import pysam
import re


def extractMDFromBam(bam):
    '''
    loop over a bam file and get the edit distance to the reference genome
    matches are counted from the MD tag and subtracted from the aligned read length
    scale by aligned read length
    '''
    samfile = pysam.AlignmentFile(bam, "rb")
    return [(read.query_alignment_length - sum([int(item) for item in re.split('[ACTG^]', read.get_tag("MD")) if not item == ''])) / read.query_alignment_length
            for read in samfile.fetch()]

As a sanity check, I looped over a bwa mem aligned bam file to extract both the scaled NM tag and the scaled MD-derived edit distance and plotted those against each other; the code is below, followed by the images. Between gathering the information from the bam file in lists and plotting the data, I convert the lists to numpy arrays and create a pandas DataFrame (see the bottom of the post). Since the scatter plot is rather overcrowded, I also added a kernel density estimate to get a better idea of the density of the dots.
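For completeness, this is roughly how the lists end up in the DataFrame used by the plotting function: a minimal sketch reusing extractNMFromBam and extractMDFromBam from the snippets above; the full multiprocessing version is at the bottom of the post.

import numpy as np
import pandas as pd


def buildDataFrame(bam):
    '''Combine both edit distance definitions in a pandas DataFrame, one row per read.'''
    datadf = pd.DataFrame()
    datadf["editDistancesNM"] = np.array(extractNMFromBam(bam))
    datadf["editDistancesMD"] = np.array(extractMDFromBam(bam))
    return datadf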


import seaborn as sns
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats


def makePlot(datadf):
    plot = sns.jointplot(
        x='editDistancesNM',
        y='editDistancesMD',
        data=datadf,
        kind="scatter",
        color="#4CB391",
        stat_func=stats.pearsonr,
        space=0,
        joint_kws={"s": 1},
        size=10)
    plot.savefig('EditDistancesCompared_scatter.png', format='png', dpi=1000)
    plot = sns.jointplot(
        x='editDistancesNM',
        y='editDistancesMD',
        data=datadf,
        kind="kde",
        color="#4CB391",
        stat_func=stats.pearsonr,
        space=0,
        size=10)
    plot.savefig('EditDistancesCompared_kde.png', format='png', dpi=1000)

The Pearson correlation coefficient is not too bad, but obviously we want to reproduce the NM tag exactly. It's clear that my implementation of getting the edit distance from the MD tag underestimates the edit distance. After thinking about it for a while, I created a question on Biostars, where Santosh Anand joined my quest. He suggested counting the mismatches in the MD string rather than the matches, which I implemented below: this time I split the MD string on all numbers and the ^ character, and sum the lengths of the remaining stretches of mismatched and deleted nucleotides. The plots obtained by this approach are shown below the code.


import pysam
import re


def extractMDFromBam(bam):
    '''
    loop over a bam file and get the edit distance to the reference genome
    mismatches are stored in the MD tag
    scale by aligned read length
    '''
    samfile = pysam.AlignmentFile(bam, "rb")
    return [sum([len(item) for item in re.split('[0-9^]', read.get_tag("MD"))]) / read.query_alignment_length
            for read in samfile.fetch()]

That's an improvement, but we're not there yet. At this point Santosh asked for a few examples of reads which showed a clear deviation between the NM and MD-derived edit distances. I wrote a function to do just that, which prints out:

  • The NM tag
  • The MD tag derived edit distance
  • The MD tag
  • The CIGAR string


import re
import pysam


def extractDisagreement(bam):
    samfile = pysam.AlignmentFile(bam, "rb")
    for read in samfile.fetch():
        NMdef = read.get_tag("NM") / read.query_alignment_length
        MDdef = sum([len(item) for item in re.split('[0-9^]', read.get_tag("MD"))]) / read.query_alignment_length
        if NMdef - MDdef > 0.2:
            print('\t'.join(
                [
                    str(read.get_tag("NM")),
                    str(sum([len(item) for item in re.split('[0-9^]', read.get_tag("MD"))])),
                    read.get_tag("MD"),
                    read.cigarstring,
                ])
            )

This is an example of a read showing a clear discrepancy between the NM tag and the MD-derived edit distance:

73
34
21G1C3A1T1A2G4A8T2G4A2G0G0G19^G2G10T6C1A2A0G0G0A5T6^GA5A0G0C0C1G3A5A6C1C2
1684H20M3I7M3I10M1I5M2I1M1I1M3I6M1I4M1I13M2I5M4I3M7I6M1D6M2I1M2I26M1I3M1I2M1I3M2D16M1I6M3I10M10942H

The hero of my quest then found that we were still missing the insertions, which are not present in the MD tag and can only be counted by parsing the CIGAR string. Indeed, the CIGAR string above contains 39 inserted bases, and 34 + 39 = 73, exactly the NM tag. I copied the image below from his post.

[Image: CountingInsertions, illustrating how insertions are counted from the CIGAR string]
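In isolation, the fix boils down to adding the inserted bases from the CIGAR to the mismatches and deletions from the MD tag; a minimal sketch for a single pysam AlignedSegment, using cigartuples in which operation code 1 denotes an insertion:

import re


def editDistanceFromMDandCigar(read):
    '''
    Edit distance for a single pysam AlignedSegment:
    mismatches and deletions from the MD tag, insertions from the CIGAR.
    '''
    mismatches_and_deletions = sum([len(item) for item in re.split('[0-9^]', read.get_tag("MD"))])
    insertions = sum([length for operation, length in read.cigartuples if operation == 1])  # 1 = insertion (I)
    return mismatches_and_deletions + insertions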

So my successful implementation also parses the CIGAR string to get the insertions and adds those to the mismatches and deletions. In my final code piece below I show the full code of my evaluation of the edit distances, including some multiprocessing code to process chromosomes in parallel and the conversion of the lists to numpy arrays in a pandas DataFrame.

The kernel density estimate plot crashes on an exact correlation, so this time you only get the scatter plot. Problem solved and everyone lived happily ever after.

Big thanks to Santosh Anand!

[Image: EditDistancesCompared_scatter, the NM-based versus MD/CIGAR-based edit distances now fall on a perfect diagonal]

 


import sys
import os
import re
import seaborn as sns
import pandas as pd
import numpy as np
from multiprocessing import Pool
from scipy import stats
import matplotlib.pyplot as plt
import pysam


def processBam(bam):
    '''
    Processing function: calls pool of worker functions
    to extract from a bam file two definitions of the edit distances to the reference genome scaled by read length
    Returned in a pandas DataFrame
    '''
    samfile = pysam.AlignmentFile(bam, "rb")
    if not samfile.has_index():
        pysam.index(bam)
        samfile = pysam.AlignmentFile(bam, "rb")  # Need to reload the samfile after creating index
    chromosomes = samfile.references
    datadf = pd.DataFrame()
    pool = Pool(processes=12)
    params = zip([bam] * len(chromosomes), chromosomes)
    try:
        output = [results for results in pool.imap(extractFromBam, params)]
    except KeyboardInterrupt:
        print("Terminating worker threads")
        pool.terminate()
        pool.join()
        sys.exit()
    datadf["editDistancesNM"] = np.array([x for y in [elem[0] for elem in output] for x in y])
    datadf["editDistancesMD"] = np.array([x for y in [elem[1] for elem in output] for x in y])
    return datadf


def extractFromBam(params):
    '''
    Worker function per chromosome
    loop over a bam file and create tuple with lists containing metrics:
    two definitions of the edit distances to the reference genome scaled by aligned read length
    '''
    bam, chromosome = params
    samfile = pysam.AlignmentFile(bam, "rb")
    editDistancesNM = []
    editDistancesMD = []
    for read in samfile.fetch(reference=chromosome, multiple_iterators=True):
        editDistancesNM.append(read.get_tag("NM") / read.query_alignment_length)
        editDistancesMD.append(
            (sum([len(item) for item in re.split('[0-9^]', read.get_tag("MD"))]) +  # Parse MD string to get mismatches/deletions
             sum([item[1] for item in read.cigartuples if item[0] == 1]))  # Parse cigar to get insertions
            / read.query_alignment_length)
    return (editDistancesNM, editDistancesMD)


def makePlot(datadf):
    try:
        plot = sns.jointplot(
            x='editDistancesNM',
            y='editDistancesMD',
            data=datadf,
            kind="kde",
            color="#4CB391",
            stat_func=stats.pearsonr,
            space=0,
            size=10)
        plot.savefig('EditDistancesCompared_kde.png', format='png', dpi=1000)
    except:  # throws an error with perfect correlation! 😀
        pass
    plot = sns.jointplot(
        x='editDistancesNM',
        y='editDistancesMD',
        data=datadf,
        kind="scatter",
        color="#4CB391",
        stat_func=stats.pearsonr,
        space=0,
        joint_kws={"s": 1},
        size=10)
    plot.savefig('EditDistancesCompared_scatter.png', format='png', dpi=1000)


df = processBam(sys.argv[1])
makePlot(df)
