# Source code for pyms.GCMS.IO.ANDI

"""
Functions for reading ANDI-MS data files.
"""

################################################################################
#                                                                              #
#    PyMassSpec software for processing of mass-spectrometry data              #
#    Copyright (C) 2005-2012 Vladimir Likic                                    #
#    Copyright (C) 2019-2020 Dominic Davis-Foster                              #
#                                                                              #
#    This program is free software; you can redistribute it and/or modify      #
#    it under the terms of the GNU General Public License version 2 as         #
#    published by the Free Software Foundation.                                #
#                                                                              #
#    This program is distributed in the hope that it will be useful,           #
#    but WITHOUT ANY WARRANTY; without even the implied warranty of            #
#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the             #
#    GNU General Public License for more details.                              #
#                                                                              #
#    You should have received a copy of the GNU General Public License         #
#    along with this program; if not, write to the Free Software               #
#    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.                 #
#                                                                              #
################################################################################

# stdlib
import os
import pathlib

# 3rd party
from domdf_python_tools.typing import PathLike
from netCDF4 import Dataset  # type: ignore[import]

try:
	# 3rd party
	from mpi4py import MPI  # type: ignore[import]  # noqa: F401
except ModuleNotFoundError:
	pass

# this package
from pyms.GCMS.Class import GCMS_data
from pyms.Spectrum import Scan

# Names exported by ``from <module> import *``.
__all__ = ["ANDI_reader"]

# netCDF dimension names.
# In the code visible in this file these two are referenced only by the
# commented-out ANDI_writer draft.
__POINT_NUMBER = "point_number"  # dimension: total number of data points
__SCAN_NUMBER = "scan_number"  # dimension: number of scans

# The names of the netCDF variables used to create and retrieve data from the file.
__MASS_STRING = "mass_values"  # mass values of every scan, stored back to back
__INTENSITY_STRING = "intensity_values"  # intensity values, parallel to the mass values
__TIME_STRING = "scan_acquisition_time"  # one acquisition time per scan
__POINT_COUNT = "point_count"  # the number of data points in each scan


def ANDI_reader(file_name: PathLike) -> GCMS_data:
	"""
	A reader for ANDI-MS NetCDF files.

	The file is opened read-only and is closed again before the scans are assembled,
	so no file handle outlives the call.

	:param file_name: The path of the ANDI-MS file

	:return: GC-MS data object

	:raises TypeError: if ``file_name`` is neither a string nor a :class:`pathlib.Path`.
	:raises FileNotFoundError: if ``file_name`` does not refer to an existing file.
	:raises ValueError: if the numbers of mass and intensity values differ,
		or if the number of time points differs from the number of scans.

	:authors: Qiao Wang, Andrew Isaac, Vladimir Likic, Dominic Davis-Foster
	"""

	if not isinstance(file_name, (str, pathlib.Path)):
		raise TypeError("'file_name' must be a string or a pathlib.Path object")

	if not os.path.isfile(file_name):
		# netCDF4 1.6.0 has stopped raising FileNotFoundError
		# and instead creates an empty file, for some reason.
		# Check explicitly so callers always get the same exception.
		raise FileNotFoundError(2, "No such file or directory", file_name)

	# TODO: find out if netCDF4 throws specific errors that we can use here
	# Mode "r": this function only reads, so write access must not be required.
	# The context manager guarantees the dataset is closed, even on error.
	with Dataset(file_name, "r", format="NETCDF3_CLASSIC") as rootgrp:
		print(f" -> Reading netCDF file '{file_name}'")

		variables = rootgrp.variables

		# Slice with [:] so each variable is read in one go and converted to
		# plain Python lists; nothing below touches the file again.
		mass_values = variables[__MASS_STRING][:].tolist()
		intensity_values = variables[__INTENSITY_STRING][:].tolist()
		# The number of data points in each scan
		scan_lengths = variables[__POINT_COUNT][:].tolist()
		time_list = variables[__TIME_STRING][:].tolist()

	if len(mass_values) != len(intensity_values):
		raise ValueError("The lengths of the mass and intensity lists differ!")

	scan_list = []
	offset = 0

	# The mass/intensity values of all scans are stored back to back;
	# cut them into per-scan chunks using the per-scan point counts.
	for length in scan_lengths:
		end = offset + length

		mass_list = mass_values[offset:end]
		# A short slice means the point counts claim more data than is stored.
		assert len(mass_list) == length

		intensity_list = intensity_values[offset:end]
		assert len(intensity_list) == length

		scan_list.append(Scan(mass_list, intensity_list))
		offset = end

	# Every stored data point must have been assigned to a scan.
	assert offset == len(mass_values)

	# sanity check
	if len(time_list) != len(scan_list):
		raise ValueError("number of time points does not equal the number of scans")

	return GCMS_data(time_list, scan_list)
# # def ANDI_writer(file_name: str, im: IntensityMatrix): # """ # A writer for ANDI-MS NetCDF files # # :param file_name: The name of the ANDI-MS file # :param im: The IntensityMatrix # # :author: Andrew Isaac # # .. TODO:: finish this # """ # # # netCDF header info for compatability # # attributes # # dataset_completeness 0 CHAR 6 C1+C2 # # dataset_origin 4 CHAR 16 Santa Clara, CA # # experiment_date_time_stamp 7 CHAR 20 20081218044500+1100 # # experiment_title 6 CHAR 7 mix ma # # experiment_type 10 CHAR 25 Centroided Mass Spectrum # # external_file_ref_0 9 CHAR 8 MA_5C.M # # languages 3 CHAR 8 English # # ms_template_revision 1 CHAR 6 1.0.1 # # netcdf_file_date_time_stamp 5 CHAR 20 20090114001531+1100 # # netcdf_revision 2 CHAR 6 2.3.2 # # number_of_times_calibrated 12 INT 1 0 # # number_of_times_processed 11 INT 1 1 # # operator_name 8 CHAR 12 Dave and Su # # raw_data_intensity_format 25 CHAR 6 Float # # raw_data_mass_format 23 CHAR 6 Float # # raw_data_time_format 24 CHAR 6 Short # # sample_state 13 CHAR 12 Other State # # test_detector_type 18 CHAR 20 Electron Multiplier # # test_ionization_mode 16 CHAR 16 Electron Impact # # test_ionization_polarity 17 CHAR 18 Positive Polarity # # test_ms_inlet 15 CHAR 17 Capillary Direct # # test_resolution_type 19 CHAR 20 Constant Resolution # # test_scan_direction 21 CHAR 3 Up # # test_scan_function 20 CHAR 10 Mass Scan # # test_scan_law 22 CHAR 7 Linear # # test_separation_type 14 CHAR 18 No Chromatography # # # dimensions # # _128_byte_string 6 128 # # _16_byte_string 3 16 # # _255_byte_string 7 255 # # _2_byte_string 0 2 # # _32_byte_string 4 32 # # _4_byte_string 1 4 # # _64_byte_string 5 64 # # _8_byte_string 2 8 # # error_number 10 1 # # instrument_number 12 1 # # point_number 9 554826 X # # range 8 2 # # scan_number 11 9865 # # # variables # # a_d_coaddition_factor 2 SHORT 0 scan_number(9865) # # a_d_sampling_rate 1 DOUBLE 0 scan_number(9865) # # actual_scan_number 7 INT 0 scan_number(9865) # # error_log 0 CHAR 0 
error_number(1), _64_byte_string(64) # # flag_count 15 INT 0 scan_number(9865) # # instrument_app_version 27 CHAR 0 instrument_number(1), # # _32_byte_string(32) # # instrument_comments 28 CHAR 0 instrument_number(1), # # _32_byte_string(32) # # instrument_fw_version 25 CHAR 0 instrument_number(1), # # _32_byte_string(32) # # instrument_id 20 CHAR 0 instrument_number(1), # # _32_byte_string(32) # # instrument_mfr 21 CHAR 0 instrument_number(1), # # _32_byte_string(32) # # instrument_model 22 CHAR 0 instrument_number(1), # # _32_byte_string(32) # # instrument_name 19 CHAR 0 instrument_number(1), # # _32_byte_string(32) # # instrument_os_version 26 CHAR 0 instrument_number(1), # # _32_byte_string(32) # # instrument_serial_no 23 CHAR 0 instrument_number(1), # # _32_byte_string(32) # # instrument_sw_version 24 CHAR 0 instrument_number(1), # # _32_byte_string(32) # # intensity_values 18 FLOAT 3 point_number(554826) # # inter_scan_time 5 DOUBLE 0 scan_number(9865) # # mass_range_max 10 DOUBLE 0 scan_number(9865) # # mass_range_min 9 DOUBLE 0 scan_number(9865) # # mass_values 16 FLOAT 2 point_number(554826) # # point_count 14 INT 0 scan_number(9865) # # resolution 6 DOUBLE 0 scan_number(9865) # # scan_acquisition_time 3 DOUBLE 0 scan_number(9865) # # scan_duration 4 DOUBLE 0 scan_number(9865) # # scan_index 13 INT 0 scan_number(9865) # # time_range_max 12 DOUBLE 0 scan_number(9865) # # time_range_min 11 DOUBLE 0 scan_number(9865) # # time_values 17 FLOAT 2 point_number(554826) # # total_intensity 8 DOUBLE 1 scan_number(9865) # # # variable information # # intensity_values attributes # # # name idx type len value # # -------------------- --- ---- --- ----- # # add_offset 1 DOUBLE 1 0.0 # # scale_factor 2 DOUBLE 1 1.0 # # units 0 CHAR 26 Arbitrary Intensity Units # # # mass_values attributes # # # name idx type len value # # -------------------- --- ---- --- ----- # # scale_factor 1 DOUBLE 1 1.0 # # units 0 CHAR 4 M/Z # # # time_values attributes # # # name idx type len 
value # # -------------------- --- ---- --- ----- # # scale_factor 1 DOUBLE 1 1.0 # # units 0 CHAR 8 Seconds # # # total_intensity attributes # # # name idx type len value # # -------------------- --- ---- --- ----- # # units 0 CHAR 26 Arbitrary Intensity Units # # if not isinstance(file_name, str): # raise TypeError("'file_name' must be a string") # try: # # Open netCDF file in overwrite mode, creating it if inexistent. # nc = CDF(file_name, NC.WRITE | NC.TRUNC | NC.CREATE) # # Automatically set define and data modes. # nc.automode() # except CDFError: # raise IOError(f"Cannot create file '{file_name}'") # # mass_list = im.mass_list # time_list = im.time_list # # # direct access, don't modify # intensity_matrix = im.intensity_array # # # compress by ignoring zero intensities # # included for consistency with imported netCDF format # mass_values = [] # intensity_values = [] # point_count_values = [] # for row in range(len(intensity_matrix)): # pc = 0 # point count # for col in range(len(intensity_matrix[0])): # all rows same len # if (intensity_matrix[row][col] > 0): # mass_values.append(mass_list[col]) # intensity_values.append(intensity_matrix[row][col]) # pc += 1 # point_count_values.append(pc) # # # sanity checks # if not len(time_list) == len(point_count_values): # raise ValueError("number of time points does not equal the number of scans") # # # create dimensions # # total number of data points # dim_point_number = nc.def_dim(__POINT_NUMBER, len(mass_values)) # # number of scans # dim_scan_number = nc.def_dim(__SCAN_NUMBER, len(point_count_values)) # # # create variables # # points # var_mass_values = nc.def_var(__MASS_STRING, NC.FLOAT, dim_point_number) # var_intensity_values = nc.def_var(__INTENSITY_STRING, NC.FLOAT, dim_point_number) # # scans # var_time_list = nc.def_var(__TIME_STRING, NC.DOUBLE, dim_scan_number) # var_point_count_values = nc.def_var(__POINT_COUNT, NC.INT, dim_scan_number) # # # populate variables # # points # var_mass_values[:] = 
mass_values # var_intensity_values[:] = intensity_values # # scans # var_time_list[:] = time_list # var_point_count_values[:] = point_count_values # # # close file # nc.close()