Source code for pyms.Gapfill.Function

"""
Functions to fill missing peak objects.
"""

################################################################################
#                                                                              #
#    PyMassSpec software for processing of mass-spectrometry data              #
#    Copyright (C) 2005-2012 Vladimir Likic                                    #
#    Copyright (C) 2019-2020 Dominic Davis-Foster                              #
#                                                                              #
#    This program is free software; you can redistribute it and/or modify      #
#    it under the terms of the GNU General Public License version 2 as         #
#    published by the Free Software Foundation.                                #
#                                                                              #
#    This program is distributed in the hope that it will be useful,           #
#    but WITHOUT ANY WARRANTY; without even the implied warranty of            #
#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the             #
#    GNU General Public License for more details.                              #
#                                                                              #
#    You should have received a copy of the GNU General Public License         #
#    along with this program; if not, write to the Free Software               #
#    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.                 #
#                                                                              #
################################################################################

# stdlib
import pathlib
from typing import List, Optional

# 3rd party
import pandas  # type: ignore[import-untyped]
from domdf_python_tools.typing import PathLike
from enum_tools import IntEnum

# this package
from pyms.BillerBiemann import get_maxima_list_reduced
from pyms.Gapfill.Class import MissingPeak, Sample
from pyms.IntensityMatrix import build_intensity_matrix_i
from pyms.Noise.SavitzkyGolay import savitzky_golay
from pyms.Peak.Function import ion_area
from pyms.TopHat import tophat
from pyms.Utils.IO import prepare_filepath
from pyms.Utils.Utils import is_path

__all__ = [
		"file2dataframe",
		"missing_peak_finder",
		"mp_finder",
		"write_filled_csv",
		"write_filled_rt_csv",
		"MZML",
		"NETCDF",
		"MissingPeakFiletype",
		]


[docs]class MissingPeakFiletype(IntEnum):
	"""
	Flag to indicate the filetype for :func:`pyms.Gapfill.Function.missing_peak_finder`.

	.. versionadded:: 2.3.0
	"""

	MZML = 1
	NETCDF = 2


MZML = MissingPeakFiletype.MZML
NETCDF = MissingPeakFiletype.NETCDF


[docs]def file2dataframe(file_name: PathLike) -> pandas.DataFrame:
	"""
	Convert a .csv file to a pandas DataFrame.

	:param file_name: CSV file to read.

	:authors: Jairus Bowne, Sean O'Callaghan, Dominic Davis-Foster (pathlib support)

	.. versionadded:: 2.3.0
	"""

	if not is_path(file_name):
		raise TypeError("'file_name' must be a string or a PathLike object")

	file_name = prepare_filepath(file_name, mkdirs=False)

	return pandas.read_csv(
			file_name,
			delimiter=',',
			quotechar='"',
			header=0,
			)


[docs]def missing_peak_finder(
		sample: Sample,
		file_name: str,
		points: int = 3,
		null_ions: Optional[List] = None,
		crop_ions: Optional[List] = None,
		threshold: int = 1000,
		rt_window: float = 1,
		filetype: MissingPeakFiletype = MZML,
		) -> None:
	r"""
	Integrates raw data around missing peak locations to fill ``NA``\s in the data matrix.

	:param sample: The sample object containing missing peaks
	:param file_name: Name of the raw data file
	:param points: Peak finding - Peak if maxima over 'points' number of scans.
	:param null_ions: Ions to be deleted in the matrix.
	:default null_ions: ``[73, 147]``
	:param crop_ions: Range of Ions to be considered.
	:default crop_ions: ``[50, 540]``
	:param threshold: Minimum intensity of IonChromatogram allowable to fill.
	:param rt_window: Window in seconds around average RT to look for.
	:param filetype:

	:author: Sean O'Callaghan
	"""

	if not null_ions:
		null_ions = [73, 147]
	if not crop_ions:
		crop_ions = [50, 540]

	# TODO: some error checks on null and crop ions

	# TODO: a for root,files,dirs in os.path.walk(): loop
	print("Sample:", sample.name, "File:", file_name)

	if filetype == NETCDF:
		# this package
		from pyms.GCMS.IO.ANDI import ANDI_reader
		data = ANDI_reader(file_name)

	elif filetype == MZML:
		# this package
		from pyms.GCMS.IO.MZML import mzML_reader
		data = mzML_reader(file_name)

	else:
		print("file type not valid")

	# build integer intensity matrix
	im = build_intensity_matrix_i(data)

	for null_ion in null_ions:
		im.null_mass(null_ion)

	im.crop_mass(crop_ions[0], crop_ions[1])

	# get the size of the intensity matrix
	n_scan, n_mz = im.size

	# smooth data
	for ii in range(n_mz):
		ic = im.get_ic_at_index(ii)
		ic1 = savitzky_golay(ic, points)
		ic_smooth = savitzky_golay(ic1, points)
		ic_base = tophat(ic_smooth, struct="1.5m")
		im.set_ic_at_index(ii, ic_base)

	for mp in sample.missing_peaks:

		mp_rt = mp.rt
		common_ion = mp.common_ion
		qual_ion_1 = float(mp.qual_ion1)
		qual_ion_2 = float(mp.qual_ion2)

		ci_ion_chrom = im.get_ic_at_mass(common_ion)
		print("ci = ", common_ion)
		qi1_ion_chrom = im.get_ic_at_mass(qual_ion_1)
		print("qi1 = ", qual_ion_1)
		qi2_ion_chrom = im.get_ic_at_mass(qual_ion_2)
		print("qi2 = ", qual_ion_2)
		######
		# Integrate the CI around that particular RT
		#######

		# Convert time to points
		# How long between scans?

		points_1 = ci_ion_chrom.get_index_at_time(float(mp_rt))
		points_2 = ci_ion_chrom.get_index_at_time(float(mp_rt) - rt_window)
		print("rt_window = ", points_1 - points_2)

		rt_window_points = points_1 - points_2

		maxima_list = get_maxima_list_reduced(ci_ion_chrom, mp_rt, rt_window_points)

		large_peaks = []

		for rt, intens in maxima_list:
			if intens > threshold:
				q1_index = qi1_ion_chrom.get_index_at_time(rt)
				q2_index = qi2_ion_chrom.get_index_at_time(rt)

				q1_intensity = qi1_ion_chrom.get_intensity_at_index(q1_index)
				q2_intensity = qi2_ion_chrom.get_intensity_at_index(q2_index)

				if q1_intensity > threshold / 2 and q2_intensity > threshold / 2:
					large_peaks.append([rt, intens])

		print(f"found {len(large_peaks):d} peaks above threshold")

		areas = []
		for peak in large_peaks:
			apex = ci_ion_chrom.get_index_at_time(peak[0])
			ia = ci_ion_chrom.intensity_array.tolist()
			area, left, right, l_share, r_share = ion_area(ia, apex, 0)
			areas.append(area)

		########################

		areas.sort()
		if len(areas) > 0:
			biggest_area = areas[-1]
			mp.common_ion_area = biggest_area
			# mp.exact_rt = f"{float(mp_rt) / 60.0:.3f}"
			mp.exact_rt = float(mp_rt) / 60.0
			print("found area:", biggest_area, "at rt:", mp_rt)
		else:
			print("Missing peak at rt = ", mp_rt)
			mp.common_ion_area = None


[docs]def mp_finder(input_matrix: List) -> List[Sample]:
	r"""
	Finds the ``'NA'``\s in the transformed ``area_ci.csv`` file and makes
	:class:`pyms.Gapfill.Class.Sample` objects with them

	:param input_matrix: Data matrix derived from the ``area_ci.csv`` file.

	:rtype:

	:authors: Jairus Bowne, Sean O'Callaghan
	"""  # noqa: D400

	sample_list = []

	try:
		ci_pos = input_matrix[0].index(' "Quant Ion"')
		print("found Quant Ion position:", ci_pos)
	except ValueError:
		ci_pos = input_matrix[0].index('"Quant Ion"')

	uid_pos = input_matrix[0].index("UID")

	# Set up the sample objects
	# All entries on line 1 beyond the Qual Ion position are sample names
	for i, sample_name in enumerate(input_matrix[0][ci_pos:]):
		print(sample_name)
		sample = Sample(sample_name, i + 3)  # add 4 to allow for UID, RT,QualIon
		sample_list.append(sample)

	for line in input_matrix[1:]:
		uid = line[uid_pos]
		common_ion = line[ci_pos]

		qual_ion_1 = uid.split('-')[0]
		qual_ion_2 = uid.split('-')[1]
		rt = uid.split('-')[-1]

		for i, area in enumerate(line[ci_pos:]):
			if area == "NA":
				missing_peak = MissingPeak(common_ion, qual_ion_1, qual_ion_2, rt)
				sample_list[i].add_missing_peak(missing_peak)

	return sample_list


[docs]def write_filled_csv(
		sample_list: List[Sample],
		area_file: PathLike,
		filled_area_file: PathLike,
		) -> None:
	r"""
	Creates a new ``area_ci.csv`` file, replacing NAs with values from the sample_list objects where possible.

	:param sample_list:
	:param area_file: The file ``'area_ci.csv'`` from PyMassSpec output.
	:param filled_area_file: the new output file which has ``'NA'``\s values replaced.

	:authors: Jairus Bowne, Sean O'Callaghan, Dominic Davis-Foster
	"""

	if not is_path(filled_area_file):
		raise TypeError("'filled_area_file' must be a string or a pathlib.Path object")

	filled_area_file = prepare_filepath(filled_area_file)

	df = file2dataframe(area_file)

	uid_list: List[str] = df["UID"]
	rt_list: List[float] = []
	for uid in uid_list:
		rt = uid.split('-')[-1]
		rt_list.append(float(rt))

	for sample_name in df.columns[3:]:

		for sample in sample_list:
			if sample_name in sample.name:
				rt_area_dict = sample.rt_areas
				break
		else:
			raise ValueError(f"Sample {sample_name!r} not found in sample_list.")

		for i, part in enumerate(df[sample_name]):
			if part == "NA":
				try:
					df[sample_name][i] = rt_area_dict[rt_list[i]]
				except KeyError:
					pass

	df.to_csv(filled_area_file, index=False, na_rep="NA")


[docs]def write_filled_rt_csv(
		sample_list: List[Sample],
		rt_file: PathLike,
		filled_rt_file: PathLike,
		) -> None:
	r"""
	Creates a new rt.csv file, replacing ``'NA'``\s with values from the sample_list objects where possible.

	:param sample_list: A list of samples.
	:param rt_file: the file ``rt.csv`` from PyMassSpec output.
	:param filled_rt_file: the new output file which has ``NA`` values replaced.

	:authors: Jairus Bowne, Sean O'Callaghan, Dominic Davis-Foster
	"""

	if not isinstance(filled_rt_file, (str, pathlib.Path)):
		raise TypeError("'filled_rt_file' must be a string or a pathlib.Path object")

	filled_rt_file = prepare_filepath(filled_rt_file)

	df = file2dataframe(rt_file)

	uid_list: List[str] = df["UID"]
	rt_list: List[str] = []
	for uid in uid_list:
		rt = uid.split('-')[-1]
		rt_list.append(rt)

	for sample_name in df.columns[3:]:
		for sample in sample_list:
			if sample_name in sample.name:
				rt_exact_rt_dict = sample.get_mp_rt_exact_rt_dict()
				break
		else:
			raise ValueError(f"Sample {sample_name!r} not found in sample_list.")

		for i, part in enumerate(df[sample_name]):
			if part == "NA":
				try:
					df[sample_name][i] = rt_exact_rt_dict[float(rt_list[i])]
				except KeyError:
					pass

	df.to_csv(filled_rt_file, index=False, na_rep="NA")