Source code for pyms.GCMS.IO.JCAMP

"""
Functions for I/O of data in JCAMP-DX format.
"""

################################################################################
#                                                                              #
#    PyMassSpec software for processing of mass-spectrometry data              #
#    Copyright (C) 2005-2012 Vladimir Likic                                    #
#    Copyright (C) 2019-2020 Dominic Davis-Foster                              #
#                                                                              #
#    Parts based on 'jcamp' by Nathan Hagen									   #
# 	 https://github.com/nzhagen/jcamp										   #
# 	 Licensed under the X11 License											   #
#                                                                              #
#    This program is free software; you can redistribute it and/or modify      #
#    it under the terms of the GNU General Public License version 2 as         #
#    published by the Free Software Foundation.                                #
#                                                                              #
#    This program is distributed in the hope that it will be useful,           #
#    but WITHOUT ANY WARRANTY; without even the implied warranty of            #
#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the             #
#    GNU General Public License for more details.                              #
#                                                                              #
#    You should have received a copy of the GNU General Public License         #
#    along with this program; if not, write to the Free Software               #
#    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.                 #
#                                                                              #
################################################################################

# stdlib
import sys
from pathlib import Path
from typing import Any, List, MutableMapping, Union

# 3rd party
from domdf_python_tools.paths import PathPlus

# this package
from pyms.GCMS.Class import GCMS_data
from pyms.Spectrum import Scan
from pyms.Utils.IO import prepare_filepath
from pyms.Utils.jcamp import header_info_fields, xydata_tags
from pyms.Utils.Math import is_float
from pyms.Utils.Utils import is_path

__all__ = ["JCAMP_reader"]


def _removeprefix(string: str, prefix: str):
	if sys.version_info >= (3, 9):
		return string.removeprefix(prefix)
	else:
		if string.startswith(prefix):
			return string[len(prefix):]
		return string


[docs]def JCAMP_reader(file_name: Union[str, Path]) -> GCMS_data: """ Generic reader for JCAMP DX files. :param file_name: Path of the file to read :return: GC-MS data object :authors: Qiao Wang, Andrew Isaac, Vladimir Likic, David Kainer, Dominic Davis-Foster (pathlib support) """ if not is_path(file_name): raise TypeError("'file_name' must be a string or a PathLike object") file_name = PathPlus(prepare_filepath(file_name, mkdirs=False)) print(f" -> Reading JCAMP file {file_name.as_posix()!r}") lines_list = file_name.read_lines() time_list: List[float] = [] scan_list: List[Scan] = [] header_info: MutableMapping[Any, Any] = {} # Dictionary containing header information line_idx = 0 while line_idx < len(lines_list): line = lines_list[line_idx] if line.strip(): if line.startswith("##"): # Label label, value = line.split('=', 1) label = _removeprefix(label, "##").upper() value = value.strip() if "PAGE" in label: if "T=" in value: # PAGE contains retention time starting with T= # FileConverter Pro style time = float(_removeprefix(value, "T=")) # rt for the scan to be submitted time_list.append(time) elif "RETENTION_TIME" in label: # OpenChrom style time = float(value) # rt for the scan to be submitted # Check to make sure time is not already in the time list; # Can happen when both ##PAGE and ##RETENTION_TIME are specified if time_list[-1] != time: time_list.append(time) elif label in header_info_fields: if value.isdigit(): header_info[label] = int(value) elif is_float(value): header_info[label] = float(value) else: header_info[label] = value elif label in xydata_tags: # Read ahead to find all XY data xydata_line_idx = line_idx + 1 xy_data_lines: List[str] = [] while xydata_line_idx < len(lines_list): xy_data_line = lines_list[xydata_line_idx] if xy_data_line.startswith("##"): break else: xy_data_lines.append(xy_data_line) xydata_line_idx += 1 line_idx += len(xy_data_lines) mass_list = [] intensity_list = [] for xy_data_line in xy_data_lines: if not xy_data_line.strip(): continue elements = xy_data_line.strip().rstrip(',').split(',') if len(elements) % 2: print(elements) raise ValueError(f"Expected an even number of values, got {len(elements)}") for mass, intensity in zip(elements[::2], elements[1::2]): mass_list.append(float(mass.strip())) intensity_list.append(float(intensity.strip())) scan_list.append(Scan(mass_list, intensity_list)) line_idx += 1 # End of while loop # sanity check time_len = len(time_list) scan_len = len(scan_list) if time_len != scan_len: print(time_list) print(scan_list) raise ValueError(f"Number of time points ({time_len}) does not equal the number of scans ({scan_len})") return GCMS_data(time_list, scan_list)