# Source code for pyms.Utils.Math

"""
Provides mathematical functions.
"""

################################################################################
#                                                                              #
#    PyMassSpec software for processing of mass-spectrometry data              #
#    Copyright (C) 2005-2012 Vladimir Likic                                    #
#    Copyright (C) 2019-2020 Dominic Davis-Foster                              #
#                                                                              #
#    is_float from 'jcamp' by Nathan Hagen								       #
# 	 https://github.com/nzhagen/jcamp										   #
# 	 Licensed under the X11 License											   #
#                                                                              #
#    This program is free software; you can redistribute it and/or modify      #
#    it under the terms of the GNU General Public License version 2 as         #
#    published by the Free Software Foundation.                                #
#                                                                              #
#    This program is distributed in the hope that it will be useful,           #
#    but WITHOUT ANY WARRANTY; without even the implied warranty of            #
#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the             #
#    GNU General Public License for more details.                              #
#                                                                              #
#    You should have received a copy of the GNU General Public License         #
#    along with this program; if not, write to the Free Software               #
#    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.                 #
#                                                                              #
################################################################################

# stdlib
import math
from statistics import mean, median
from statistics import stdev as std
from typing import Iterable, List, Sequence, Union, overload

# 3rd party
import numpy

# this package
from pyms.Utils.Utils import is_number, is_sequence

# Names exported by ``from pyms.Utils.Math import *``.
# ``mean``, ``median`` and ``std`` are not defined in this module: they are
# re-exported from the standard library's ``statistics`` module, with ``std``
# being an alias of ``statistics.stdev``.
__all__ = [
		"vector_by_step",
		"MAD",
		"rmsd",
		"mad_based_outlier",
		"percentile_based_outlier",
		"median_outliers",
		"is_float",
		"mean",
		"median",
		"std",
		]


def vector_by_step(start: float, stop: float, step: float) -> List[float]:
	"""
	Generates a list by using start, stop, and step values.

	Values begin at ``start`` and grow by ``step`` for as long as they remain
	strictly less than ``stop``, so ``stop`` itself is never part of the result.
	Each value is obtained by repeatedly adding ``step`` to the previous one.

	:param start: Initial value
	:param stop: Max value (exclusive)
	:param step: Step

	:return: The generated values; an empty list if ``start`` is not less than ``stop``.

	:raises TypeError: If ``start``, ``stop`` or ``step`` is rejected by ``is_number``.
	:raises ValueError: If ``step`` is zero or negative while ``start`` is less
		than ``stop``, since the sequence would never reach ``stop``.

	:author: Vladimir Likic
	"""

	if not is_number(start) or not is_number(stop) or not is_number(step):
		raise TypeError("parameters 'start', 'stop', and 'step' must be numbers")

	# A non-positive step can never advance towards 'stop'; without this guard
	# the loop below would run forever and grow the list without bound.
	# The check is limited to start < stop so that the (already terminating)
	# empty-result case keeps returning [] regardless of the step.
	if start < stop and step <= 0:
		raise ValueError("parameter 'step' must be positive when 'start' is less than 'stop'")

	v = []

	p = start
	while p < stop:
		v.append(p)
		p = p + step

	return v


def MAD(v: Union[Sequence, numpy.ndarray]) -> float:
	"""
	Median absolute deviation.

	Computes the median of the absolute differences between each value and
	the median of ``v``, then divides that median by ``0.6745``.

	:param v: List of values to calculate the median absolute deviation of.

	:return: median absolute deviation

	:raises TypeError: If ``v`` is rejected by ``is_sequence``.

	:author: Vladimir Likic
	"""

	if not is_sequence(v):
		raise TypeError("'v' must be a Sequence")

	centre = median(v)

	# Absolute distance of every element from the median of the data.
	deviations = [math.fabs(value - centre) for value in v]

	return median(deviations) / 0.6745


def rmsd(list1: Union[Sequence, numpy.ndarray], list2: Union[Sequence, numpy.ndarray]) -> float:
	"""
	Calculates RMSD for the 2 lists.

	The result is the square root of the mean of the squared element-wise
	differences between ``list1`` and ``list2``.

	:param list1: First data set
	:param list2: Second data set; must have the same length as ``list1``.

	:return: RMSD value

	:raises TypeError: If ``list1`` or ``list2`` is rejected by ``is_sequence``.
	:raises ValueError: If ``list1`` and ``list2`` differ in length.
	:raises ZeroDivisionError: If both inputs are empty, as the mean is taken over ``len(list1)``.

	:authors: Qiao Wang, Andrew Isaac, Vladimir Likic
	"""

	if not is_sequence(list1):
		raise TypeError("'list1' must be a Sequence")

	if not is_sequence(list2):
		raise TypeError("'list2' must be a Sequence")

	# Elements are compared pairwise, so inputs of different lengths have no
	# meaningful RMSD. Reject them up front rather than silently ignoring the
	# tail of a longer 'list2' or failing part-way through on a shorter one.
	if len(list1) != len(list2):
		raise ValueError("'list1' and 'list2' must be the same length")

	total = 0.0
	for a, b in zip(list1, list2):
		total = total + (a - b)**2

	return math.sqrt(total / len(list1))


def mad_based_outlier(data: Iterable[float], thresh: float = 3.5) -> Sequence[float]:
	"""
	Identify outliers using the median absolute deviation (MAD).

	Each observation's distance from the NaN-ignoring median of ``data`` is
	scaled by ``0.6745`` and divided by the median of all such distances; an
	observation is flagged when that score exceeds ``thresh``.
	One-dimensional input is treated as a column of single-valued observations;
	for input with more dimensions the squared deviations are summed along the last axis.

	:param data: The values to check for outliers.
	:param thresh: Score above which an observation is flagged as an outlier.

	:return: Boolean array, :py:obj:`True` where the observation is an outlier.

	:author: David Kainer
	:url: http://stackoverflow.com/questions/22354094/pythonic-way-of-detecting-outliers-in-one-dimensional-observation-data
	"""

	observations = numpy.array(data)

	# Promote a flat vector to a column so the reduction below runs per observation.
	if observations.ndim == 1:
		observations = observations[:, None]

	centre = numpy.nanmedian(observations)

	# Euclidean distance of each observation from the overall median (NaNs contribute zero).
	squared = numpy.nansum((observations - centre)**2, dtype=float, axis=-1)
	distance = numpy.sqrt(squared)

	scale = numpy.nanmedian(distance)

	return (0.6745 * distance / scale) > thresh


def percentile_based_outlier(data: Iterable[float], threshold: int = 95) -> Sequence[float]:
	"""
	Identify outliers using a percentile.

	The central ``threshold`` percent of the non-NaN values is taken as the
	accepted range; values strictly below its lower bound or strictly above
	its upper bound are flagged. NaN values are never flagged.

	:param data: The values to check for outliers.
	:param threshold: Percentage of the data, centred on the median, regarded as not outlying.

	:return: Boolean array with the same shape as ``data``, :py:obj:`True` where the value is an outlier.

	:author: David Kainer
	:url: http://stackoverflow.com/questions/22354094/pythonic-way-of-detecting-outliers-in-one-dimensional-observation-data
	"""

	data = numpy.array(data)

	# Percentage cut from each end of the distribution.
	tail = (100 - threshold) / 2.0

	# NaNs must be excluded before computing the percentiles.
	# NOTE: an element-wise inversion is required here; comparing the mask
	# with ``is False`` tests object identity and is never true for an array.
	valid = data[~numpy.isnan(data)]

	if valid.size == 0:
		# Nothing to derive bounds from, so nothing can be an outlier.
		return numpy.zeros(data.shape, dtype=bool)

	minval, maxval = numpy.percentile(valid, (tail, 100 - tail))

	return (data < minval) | (data > maxval)


def median_outliers(data: Iterable[float], m: float = 2.5):  # noqa: MAN002
	"""
	Identify outliers using the median value.

	Each value's absolute deviation from the NaN-ignoring median is divided by
	the median of those deviations, and the value is flagged when the ratio
	exceeds ``m``. If the median deviation is zero the ratios are all taken to
	be zero.

	:param data: The values to check for outliers.
	:param m: Ratio above which a value is flagged as an outlier.

	:return: Boolean array with the same shape as ``data``, :py:obj:`True` where the value is an outlier.

	:author: David Kainer
	:author: eumiro (https://stackoverflow.com/users/449449/eumiro)
	:author: Benjamin Bannier (https://stackoverflow.com/users/176922/benjamin-bannier)
	:url: http://stackoverflow.com/questions/11686720/is-there-a-numpy-builtin-to-reject-outliers-from-a-list
	"""

	data = numpy.array(data)
	d = numpy.abs(data - numpy.nanmedian(data))
	mdev = numpy.nanmedian(d)

	if mdev:
		s = d / mdev
	else:
		# A zero median deviation cannot be used as a divisor. Use an array of
		# zeros (rather than a bare scalar) so the result is still a
		# per-element mask that callers can index with.
		s = numpy.zeros_like(d, dtype=float)

	return s > m


@overload
def is_float(s: str) -> bool: ...


@overload
def is_float(s: List[str]) -> List[bool]: ...


def is_float(s: Union[str, List[str]]) -> Union[bool, List[bool]]:
	"""
	Test if a string, or list of strings, contains a numeric value(s).

	A string counts as numeric when :class:`float` accepts it.

	:param s: The string or list of strings to test. A tuple of strings is also accepted.

	:return: A single boolean or list of boolean values indicating whether each input can be converted into a float.

	:raises TypeError: If ``s`` is neither a string nor a list/tuple made up solely of strings.
	:raises ValueError: If ``s`` is an empty list or tuple.
	"""

	def _parses(text: str) -> bool:
		# True when float() accepts the text, False when it rejects it.
		try:
			float(text)
		except ValueError:
			return False
		return True

	# Single value
	if not isinstance(s, (tuple, list)):
		if isinstance(s, str):
			return _parses(s)
		raise TypeError(f"Input '{s}' is not a string")

	# Collection of values: the element types are validated before emptiness.
	if any(not isinstance(item, str) for item in s):
		raise TypeError(f"Input {s} is not a list of strings")

	if not s:
		raise ValueError(f'Input {s} is empty')

	return [_parses(item) for item in s]