# Source code for pyms.DPA.PairwiseAlignment

"""
Classes for peak alignment by dynamic programming.
"""

################################################################################
#                                                                              #
#    PyMassSpec software for processing of mass-spectrometry data              #
#    Copyright (C) 2005-2012 Vladimir Likic                                    #
#    Copyright (C) 2019-2020 Dominic Davis-Foster                              #
#                                                                              #
#    This program is free software; you can redistribute it and/or modify      #
#    it under the terms of the GNU General Public License version 2 as         #
#    published by the Free Software Foundation.                                #
#                                                                              #
#    This program is distributed in the hope that it will be useful,           #
#    but WITHOUT ANY WARRANTY; without even the implied warranty of            #
#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the             #
#    GNU General Public License for more details.                              #
#                                                                              #
#    You should have received a copy of the GNU General Public License         #
#    along with this program; if not, write to the Free Software               #
#    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.                 #
#                                                                              #
################################################################################

# stdlib
import copy
import functools
import math
from typing import Dict, List

# 3rd party
import numpy

# this package
from pyms.Peak import Peak

try:
	# 3rd party
	from mpi4py import MPI  # type: ignore[import]
except ModuleNotFoundError:
	pass

# this package
from pyms.DPA.Alignment import Alignment
from pyms.DPA.clustering import treecluster
from pyms.Utils.Utils import is_sequence_of

__all__ = [
		"PairwiseAlignment",
		"align",
		"score_matrix",
		"dp",
		"position_similarity",
		"merge_alignments",
		"alignment_similarity",
		"alignment_compare",
		"score_matrix_mpi",
		"align_with_tree",  # "align_with_tree_mpi",
		]


class PairwiseAlignment:
	"""
	Pairwise comparison of a collection of alignments.

	On construction every pair of alignments is aligned with :func:`align`,
	the similarities are converted to distances, and a guide tree is built
	from the distance matrix with ``treecluster``. The results are stored in
	the ``sim_matrix``, ``dist_matrix`` and ``tree`` attributes.

	:param alignments: A list of alignments.
	:param D: Retention time tolerance parameter (in seconds) for pairwise alignments.
	:param gap: Gap parameter for pairwise alignments.

	:raises TypeError: If ``alignments`` is not a sequence of
		:class:`~pyms.DPA.Alignment.Alignment` objects, or if ``D`` or ``gap`` is not a float.

	:authors: Woon Wai Keen, Vladimir Likic
	"""

	def __init__(self, alignments: List[Alignment], D: float, gap: float):
		if not is_sequence_of(alignments, Alignment):
			raise TypeError("'alignments' must be a Sequence of Alignment objects")

		# 'D' is validated before 'gap', so it is reported first when both are wrong.
		for label, value in (('D', D), ("gap", gap)):
			if not isinstance(value, float):
				raise TypeError(f"'{label}' must be a float")

		self.alignments = alignments
		self.D = D
		self.gap = gap

		# Each step consumes the attribute produced by the previous one.
		self._sim_matrix()
		self._dist_matrix()
		self._guide_tree()

	def _sim_matrix(self) -> None:
		"""
		Fill ``self.sim_matrix`` with the similarity of every pair of alignments.

		The matrix is symmetric with a zero diagonal; progress is printed
		after each pair.

		:authors: Woon Wai Keen, Vladimir Likic
		"""

		count = len(self.alignments)
		remaining = count * (count - 1) // 2

		print(f" Calculating pairwise alignments for {count:d} alignments (D={self.D:.2f}, gap={self.gap:.2f})")

		similarities = numpy.zeros((count, count), dtype='f')
		self.sim_matrix = similarities

		# Only the upper triangle is computed; each value is mirrored.
		for row in range(count - 1):
			first = self.alignments[row]
			for col in range(row + 1, count):
				merged = align(first, self.alignments[col], self.D, self.gap)
				similarities[row, col] = merged.similarity
				similarities[col, row] = merged.similarity
				remaining -= 1
				print(f" -> {remaining:d} pairs remaining")

	def _dist_matrix(self) -> None:
		"""
		Derive ``self.dist_matrix`` from ``self.sim_matrix``.

		Each entry becomes ``max(sim_matrix) - similarity`` and the diagonal
		of the distance matrix is then reset to zero.

		:authors: Woon Wai Keen, Vladimir Likic
		"""

		largest = numpy.max(self.sim_matrix)
		distances = largest - self.sim_matrix

		# an item is at distance zero from itself
		numpy.fill_diagonal(distances, 0)

		self.dist_matrix = distances

	def _guide_tree(self) -> None:
		"""
		Cluster the distance matrix and store the result in ``self.tree``.

		:authors: Woon Wai Keen, Vladimir Likic
		"""

		size = len(self.dist_matrix)

		print(f" -> Clustering {size * (size - 1):d} pairwise alignments.", end='')
		self.tree = treecluster(data=None, distancematrix=self.dist_matrix, method='a')
		print("Done")


def align(a1: Alignment, a2: Alignment, D: float, gap: float) -> Alignment:
	"""
	Align two alignments against each other and merge them.

	The score matrix of the two alignments is computed, dynamic programming
	finds the best path through it, and the two alignments are merged along
	that path. The similarity of the pair is stored on the result as its
	``similarity`` attribute.

	:param a1: The first alignment
	:param a2: The second alignment
	:param D: Retention time tolerance in seconds.
	:param gap: Gap penalty

	:return: The merged alignment.

	:authors: Woon Wai Keen, Vladimir Likic
	"""

	scores = score_matrix(a1, a2, D)

	# only the traceback of the dynamic programming result is needed here
	traceback = dp(scores, gap)["trace"]

	merged = merge_alignments(a1, a2, traceback)
	merged.similarity = alignment_similarity(traceback, scores, gap)

	return merged


def score_matrix(a1: Alignment, a2: Alignment, D: float) -> numpy.ndarray:
	"""
	Build the matrix of position-by-position scores between two alignments.

	Entry ``[i, j]`` is :func:`position_similarity` of position ``i`` of
	``a1`` and position ``j`` of ``a2``.

	:param a1: The first alignment.
	:param a2: The second alignment.
	:param D: Retention time tolerance in seconds.

	:return: The score matrix, with one row per position of ``a1`` and one
		column per position of ``a2``.

	:authors: Qiao Wang, Andrew Isaac
	"""

	rows = a1.peakalgt
	cols = a2.peakalgt

	scores = numpy.zeros((len(rows), len(cols)))

	for r, pos1 in enumerate(rows):
		for c, pos2 in enumerate(cols):
			scores[r, c] = position_similarity(pos1, pos2, D)

	return scores


def dp(S, gap_penalty: float) -> Dict:
	"""
	Solves optimal path in score matrix based on global sequence alignment.

	:param S: Score matrix, indexed as a 2-D array (``S[i, j]``); lower scores are better.
	:param gap_penalty: Gap penalty

	:return: A dictionary of results with the keys:

		* ``'p'`` and ``'q'`` -- row and column indices into ``S`` along the path
		* ``'trace'`` -- the direction taken at each step
		  (0 = match, 1 = gap in the second sequence, 2 = gap in the first sequence)
		* ``'matches'`` -- ``[row, column]`` pairs of the matched positions
		* ``'D'`` -- the accumulated score matrix
		* ``'phi'`` -- the matrix of directions

	:raises IndexError: If ``S`` has no rows or no columns.

	:author: Tim Erwin
	"""

	# Both lookups are guarded: a matrix with zero columns fails on the first,
	# a matrix with zero rows fails on the second.
	try:
		row_length = len(S[:, 0])
		col_length = len(S[0, :])
	except IndexError:
		raise IndexError("Zero length alignment found: Samples with no peaks cannot be aligned") from None

	# D contains the score of the optimal alignment.
	# The first row and column hold the cost of leading gaps; the interior
	# is filled in by the recurrence below.
	D = numpy.zeros((row_length + 1, col_length + 1), dtype='d')
	D[1:, 0] = gap_penalty * numpy.arange(1, row_length + 1)
	D[0, 1:] = gap_penalty * numpy.arange(1, col_length + 1)

	# Directions for trace
	# 0 - match               (move diagonal)
	# 1 - peaks1 has no match (move up)
	# 2 - peaks2 has no match (move left)
	# 3 - stop
	trace_matrix = numpy.zeros((row_length + 1, col_length + 1))
	trace_matrix[:, 0] = 1
	trace_matrix[0, :] = 2
	trace_matrix[0, 0] = 3

	for i in range(1, row_length + 1):
		for j in range(1, col_length + 1):
			#
			# Needleman-Wunsch Algorithm assuming a score function S(x,x)=0
			#
			#              | D[i-1,j-1] + S(i,j)
			# D[i,j] = min | D[i-1,j] + gap
			#              | D[i,j-1] + gap
			#

			candidates = [
					D[i - 1, j - 1] + S[i - 1, j - 1],
					D[i - 1, j] + gap_penalty,
					D[i, j - 1] + gap_penalty,
					]
			best = min(candidates)
			D[i, j] = best
			# The position in ``candidates`` is the direction code;
			# ties resolve to the lowest code.
			trace_matrix[i, j] = candidates.index(best)

	# Trace back from bottom right
	trace = []
	matches = []
	i = row_length
	j = col_length
	direction = trace_matrix[i, j]
	p = [row_length - 1]
	q = [col_length - 1]

	while direction != 3:

		if direction == 0:  # Match
			i = i - 1
			j = j - 1
			matches.append([i, j])
		elif direction == 1:  # peaks1 has no match
			i = i - 1
		elif direction == 2:  # peaks2 has no match
			j = j - 1
		p.append(i - 1)
		q.append(j - 1)
		trace.append(direction)
		direction = trace_matrix[i, j]

	# remove 'stop' entry
	p.pop()
	q.pop()
	# reverse the trace back
	p.reverse()
	q.reverse()
	trace.reverse()
	matches.reverse()

	return {'p': p, 'q': q, "trace": trace, "matches": matches, 'D': D, "phi": trace_matrix}


def position_similarity(pos1: List[Peak], pos2: List[Peak], D: float) -> float:
	"""
	Calculates the similarity between the two alignment positions.

	A score of 0 is best and 1 is worst.

	Every peak of ``pos1`` is compared with every peak of ``pos2``
	(``None`` entries are skipped) and the pair scores are averaged.
	A pair whose retention times differ by more than a cutoff derived
	from ``D`` scores 1; otherwise the score is one minus the cosine of
	the two mass spectra weighted by a Gaussian of the retention time
	difference.

	:param pos1: The position of the first alignment.
	:param pos2: The position of the second alignment.
	:param D: Retention time tolerance in seconds.

	:return: The similarity value for the current position.
		``1.0`` if there is no pair of peaks to compare.

	:raises ValueError: If two compared mass spectra have different lengths.

	:authors: Qiao Wang, Vladimir Likic, Andrew Isaac
	"""

	# Attempt to speed up by only calculating 'in-range' values
	# set tolerance to 1/1000
	_TOL = 0.001
	cutoff = D * math.sqrt(-2.0 * math.log(_TOL))

	peaks2 = [b for b in pos2 if b is not None]
	# (spectrum vector, sum of squares) for each peak of pos2, filled in the
	# first time the peak is in range so the conversion happens at most once.
	spectra2: list = [None] * len(peaks2)

	score = 0.0
	count = 0

	for a in pos1:
		if a is None:
			continue

		aspec = a.mass_spectrum.mass_spec
		art = a.rt
		# converted lazily: only needed if some peak of pos2 is in range
		mass_spect1 = None
		mass_spect1_sum = None

		for k, b in enumerate(peaks2):
			brt = b.rt
			count += 1

			if abs(art - brt) > cutoff:
				score += 1.0  # NB score of 1 is worst
				continue

			if mass_spect1 is None:
				mass_spect1 = numpy.array(aspec, dtype='d')
				mass_spect1_sum = numpy.sum(mass_spect1**2, axis=0)

			if spectra2[k] is None:
				converted = numpy.array(b.mass_spectrum.mass_spec, dtype='d')
				spectra2[k] = (converted, numpy.sum(converted**2, axis=0))
			mass_spect2, mass_spect2_sum = spectra2[k]

			try:
				top = numpy.dot(mass_spect1, mass_spect2)
			except ValueError as err:
				raise ValueError(
						"Mass Spectra are of different lengths.\n"
						"Use `IntensityMatrix.crop_mass()` to set same length for all Mass Spectra"
						) from err

			bot = numpy.sqrt(mass_spect1_sum * mass_spect2_sum)
			# an all-zero spectrum has no direction; treat it as dissimilar
			cos = top / bot if bot > 0 else 0
			rtime = numpy.exp(-((art - brt) / float(D))**2 / 2.0)
			score = score + (1.0 - (cos * rtime))

	if count == 0:
		return 1.0  # NB score of 1 is worst

	return score / float(count)


def merge_alignments(A1: Alignment, A2: Alignment, traces) -> Alignment:
	"""
	Merges two alignments with gaps added in from DP traceback.

	The merged alignment holds the rows of ``A1.peakpos`` followed by the
	rows of ``A2.peakpos``; ``None`` marks a gap. Its positions are sorted
	with :func:`alignment_compare`.

	:param A1: First alignment.
	:param A2: Second alignment.
	:param traces: DP traceback.

	:return: A single alignment from ``A1`` and ``A2``.

	:authors: Woon Wai Keen, Vladimir Likic, Qiao Wang
	"""

	# Create object to hold new merged alignment and fill in its expr_codes
	ma = Alignment(None)
	ma.expr_code = A1.expr_code + A2.expr_code

	# one output row per row of A1 followed by one per row of A2
	n_rows1 = len(A1.peakpos)
	n_rows2 = len(A2.peakpos)
	merged: List[List[Peak]] = [[] for _ in range(n_rows1 + n_rows2)]

	# Slicing copies the outer list only, so these still refer to the same
	# row lists as ``merged``. Splitting explicitly avoids computing the A2
	# offset from a loop variable left over from iterating A1.
	rows1 = merged[:n_rows1]
	rows2 = merged[n_rows1:]

	idx1 = idx2 = 0

	# trace can either be 0, 1, or 2
	# if it is 0, there are no gaps. otherwise, if it is 1 or 2,
	# there is a gap in A2 or A1 respectively.

	for trace in traces:

		if trace in {0, 1}:
			for row, peaks in zip(rows1, A1.peakpos):
				row.append(peaks[idx1])

			idx1 = idx1 + 1

		elif trace == 2:
			for row in rows1:
				row.append(None)  # type: ignore[arg-type]

		# ---

		if trace in {0, 2}:
			for row, peaks in zip(rows2, A2.peakpos):
				row.append(peaks[idx2])

			idx2 = idx2 + 1

		elif trace == 1:
			for row in rows2:
				row.append(None)  # type: ignore[arg-type]

	ma.peakalgt = numpy.transpose(merged)  # type: ignore[assignment, arg-type]
	# sort according to average peak
	ma.peakalgt = list(ma.peakalgt)
	ma.peakalgt.sort(key=functools.cmp_to_key(alignment_compare))
	ma.peakpos = numpy.transpose(ma.peakalgt)  # type: ignore[arg-type, assignment]

	return ma


def alignment_similarity(traces, score_matrix, gap: float) -> float:
	"""
	Score how similar two alignments are along a DP traceback.

	The score matrix is first converted to ``1.0 - score_matrix``. Walking
	the traceback, every match (``0``) adds the converted score of the
	matched pair and every gap (``1`` or ``2``) subtracts the gap penalty.

	:param traces: Traceback from DP algorithm.
	:param score_matrix: Score matrix of the two alignments.
	:param gap: Gap penalty.

	:return: Similarity score (i.e. more similar => higher score)

	:authors: Woon Wai Keen, Vladimir Likic
	"""

	match_scores = 1.0 - score_matrix
	total = 0.
	row = col = 0

	for step in traces:
		if step == 0:
			# match: consume one position from each alignment
			total += match_scores[row][col]
			row, col = row + 1, col + 1
		elif step in (1, 2):
			# gap: consume a position from only one alignment
			total -= gap
			if step == 1:
				row += 1
			else:
				col += 1

	return total


def alignment_compare(x, y) -> int:
	"""
	A helper function for sorting peak positions in an alignment.

	Compares two alignment positions by the mean retention time of their
	peaks; falsy entries such as ``None`` (gaps) are ignored. Suitable for
	use with :func:`functools.cmp_to_key`.

	:param x: The first position: an iterable of peaks (objects with an ``rt`` attribute) and/or ``None``.
	:param y: The second position, in the same form as ``x``.

	:return: ``-1`` if the mean retention time of ``x`` is lower than that
		of ``y``, ``0`` if the two means are equal, and ``1`` otherwise.
	"""

	rts_x = [peak.rt for peak in filter(None, x)]
	rts_y = [peak.rt for peak in filter(None, y)]

	avg_x = numpy.sum(rts_x) / len(rts_x)
	avg_y = numpy.sum(rts_y) / len(rts_y)

	if avg_x < avg_y:
		return -1
	if avg_x == avg_y:
		# A tie must compare equal in both directions for the comparison
		# to be consistent.
		return 0
	return 1


def score_matrix_mpi(a1: Alignment, a2: Alignment, D: float):  # TODO: return type
	"""
	Calculates the score matrix between two alignments, sharing the work between MPI processes.

	The positions of ``a1`` are split into one contiguous slice per rank;
	the last rank also takes any remainder. Each rank scores its slice
	against every position of ``a2``, rank 0 assembles the full matrix,
	and the result is broadcast so that every rank returns it.

	Requires ``mpi4py``.

	:param a1: The first alignment.
	:param a2: The second alignment.
	:param D: Retention time tolerance in seconds.

	:return: The score matrix, with one row per position of ``a1`` and one
		column per position of ``a2``.

	:authors: Qiao Wang, Andrew Isaac
	"""

	comm = MPI.COMM_WORLD
	rank = comm.Get_rank()
	size = comm.Get_size()

	n_rows = len(a1.peakalgt)
	n_cols = len(a2.peakalgt)
	portion = n_rows // size

	def bounds(r: int):
		# Row range handled by rank ``r``; the last rank takes the remainder.
		start = r * portion
		stop = n_rows if r == size - 1 else start + portion
		return start, stop

	start, stop = bounds(rank)
	score_matrix_part = numpy.zeros((stop - start, n_cols))

	for i, algt1pos in enumerate(a1.peakalgt[start:stop]):
		for j, algt2pos in enumerate(a2.peakalgt):
			score_matrix_part[i][j] = position_similarity(algt1pos, algt2pos, D)

	if rank == 0:
		score_matrix = numpy.zeros((n_rows, n_cols))
		score_matrix[start:stop] = score_matrix_part

		for source in range(1, size):
			lo, hi = bounds(source)
			recv_buffer = numpy.zeros((hi - lo, n_cols))
			comm.Recv(recv_buffer, source=source)
			score_matrix[lo:hi] = recv_buffer
	else:
		# all other processes send their result to rank 0;
		# ``Send`` requires the destination rank
		comm.Send(score_matrix_part, dest=0)
		score_matrix = None

	# hand the assembled matrix to every rank
	return comm.bcast(score_matrix, root=0)


def align_with_tree(T: PairwiseAlignment, min_peaks: int = 1) -> Alignment:
	"""
	Progressively align the alignments of ``T`` by following its guide tree.

	:param T: The pairwise alignment object.
	:param min_peaks: If greater than 1, ``filter_min_peaks(min_peaks)`` is
		called on the final alignment before it is returned.

	:return: The final alignment consisting of aligned input alignments.

	:authors: Woon Wai Keen, Vladimir Likic
	"""

	print(f" Aligning {len(T.alignments):d} items with guide tree (D={T.D:.2f}, gap={T.gap:.2f})")

	# The guide tree refers to the original items by the indices 0 ... n-1
	# and to the nodes it creates by -1 ... -(n-1). Padding the list with
	# n empty slots lets both kinds of index address the same list: the
	# result of each node is stored at its negative index.
	item_count = len(T.alignments)
	slots: List[Alignment] = copy.deepcopy(T.alignments) + [None] * item_count  # type: ignore[operator]

	node_count = len(T.tree)
	position = 0

	for step, node in enumerate(T.tree[:], start=1):
		position = -step
		slots[position] = align(slots[node.left], slots[node.right], T.D, T.gap)
		print(f" -> {node_count - step:d} item(s) remaining")

	# the last node processed is the root and holds the final alignment
	final_algt: Alignment = slots[position]

	# useful for within state alignment only
	if min_peaks > 1:
		final_algt.filter_min_peaks(min_peaks)

	return final_algt


#
# def align_with_tree_mpi(T: Alignment, min_peaks: int = 1) -> Alignment:
# 	"""
# 	Aligns a list of alignments using the supplied guide tree
#
# 	:param T: The pairwise alignment object
# 	:param min_peaks:
#
# 	:return: The final alignment consisting of aligned input alignments
#
# 	:authors: Woon Wai Keen, Vladimir Likic
# 	"""
#
# 	try:
# 		rank = MPI.COMM_WORLD.Get_rank()
# 	except:
# 		rank = 0
#
# 	if rank == 0:
# 		print(f" Aligning {len(T.alignments):d} items with guide tree (D={T.D:.2f}, gap={T.gap:.2f})")
#
# 	# For everything else, we align according to the guide tree provided by
# 	# Pycluster. From Pycluster documentation:
# 	#   Each item and subnode is represented by an integer. For hierarchical
# 	#   clustering of n items, we number the original items {0, ... , n-1},
# 	#   nodes are numbered {-1, ... , -(n-1)}. Note that the number of nodes
# 	#   is one less than the number of items.
#
# 	# extend As to length 2n to hold the n items, n-1 nodes, and 1 root
# 	As = copy.deepcopy(T.alignments) + [None for _ in range(len(T.alignments))]
#
# 	# align the alignments into positions -1, ... ,-(n-1)
# 	total = len(T.tree)
# 	index = 0
#
# 	for node in T.tree[:]:
# 		index = index - 1
# 		As[index] = align(As[node.left], As[node.right], T.D, T.gap)
# 		total = total - 1
# 		if rank == 0:
# 			print(f" -> {total:d} item(s) remaining")
#
# 	# the final alignment is in the root. Filter min peaks and return
# 	final_algt = As[index]
#
# 	# useful for within state alignment only
# 	if min_peaks > 1:
# 		final_algt.filter_min_peaks(min_peaks)
#
# 	return final_algt