From b705c997cbd6acd05e33d608f6967be273db0669 Mon Sep 17 00:00:00 2001 From: Adam Paquette <acp263@nau.edu> Date: Tue, 26 Dec 2017 14:00:56 -0700 Subject: [PATCH] Added io files and there associated tests from pysat. --- plio/io/io_ccam_pds.py | 201 ++++++++++++++++++++++++++++++ plio/io/io_ccs.py | 198 +++++++++++++++++++++++++++++ plio/io/tests/test_io_ccam_pds.py | 19 +++ plio/io/tests/test_io_ccs.py | 19 +++ 4 files changed, 437 insertions(+) create mode 100644 plio/io/io_ccam_pds.py create mode 100644 plio/io/io_ccs.py create mode 100644 plio/io/tests/test_io_ccam_pds.py create mode 100644 plio/io/tests/test_io_ccs.py diff --git a/plio/io/io_ccam_pds.py b/plio/io/io_ccam_pds.py new file mode 100644 index 0000000..2a9a767 --- /dev/null +++ b/plio/io/io_ccam_pds.py @@ -0,0 +1,201 @@ +# This code is used to read individual ChemCam files +# Header data is stored as attributes of the data frame +# White space is stripped from the column names +import os + +import numpy as np +import pandas as pd +import scipy.io as io + +from plio.utils.utils import lookup +from plio.utils.utils import file_search + + +def CCAM_CSV(input_data, ave=True): + try: + df = pd.read_csv(input_data, header=14, engine='c') + cols = list(df.columns.values) + df.columns = [i.strip().replace('# ', '') for i in cols] # strip whitespace from column names + df.set_index(['wave'], inplace=True) # use wavelengths as indices + # read the file header and put information into the dataframe as new columns + metadata = pd.read_csv(input_data, sep='=', nrows=14, comment=',', engine='c', index_col=0, header=None) + except: + try: # handle files with an extra header row containing temperature + df = pd.read_csv(input_data, header=15, engine='c') + cols = list(df.columns.values) + df.columns = [i.strip().replace('# ', '') for i in cols] # strip whitespace from column names + df.set_index(['wave'], inplace=True) # use wavelengths as indices + # read the file header and put information into the dataframe as new columns + metadata = pd.read_csv(input_data, sep='=', nrows=15, comment=',', engine='c', index_col=0, header=None) + except: # handle files with an extra header row containing temperature and target name + df = pd.read_csv(input_data, header=16, engine='c') + cols = list(df.columns.values) + df.columns = [i.strip().replace('# ', '') for i in cols] # strip whitespace from column names + df.set_index(['wave'], inplace=True) # use wavelengths as indices + # read the file header and put information into the dataframe as new columns + metadata = pd.read_csv(input_data, sep='=', nrows=16, comment=',', engine='c', index_col=0, header=None) + + if ave: + df = pd.DataFrame(df['mean']) + else: + df = df.drop(['mean', 'median'], axis=1) + df.index = [['wvl'] * len(df.index), + df.index.values.round(4)] # create multiindex so spectra can be easily extracted with a single key + df = df.T # transpose so that each spectrum is a row + + # remove extraneous stuff from the metadataindices + metadata.index = [i.strip().strip('# ').replace(' FLOAT', '').lower() for i in metadata.index.values] + metadata = metadata.T + + # extract info from the file name + fname = os.path.basename(input_data) + metadata['sclock'] = fname[4:13] + metadata['seqid'] = fname[25:34].upper() + metadata['Pversion'] = fname[34:36] + + # duplicate the metadata for each row in the df + if not ave: + metadata = metadata.append([metadata] * (len(df.index) - 1), ignore_index=True) + metadata.index = df.index # make the indices match + metadata.columns = [['meta'] * len(metadata.columns), metadata.columns.values] # make the columns into multiindex + df = pd.concat([metadata, df], axis=1) # combine the spectra with the metadata + return df + + +def CCAM_SAV(input_data, ave=True): + # read the IDL .SAV file + + data = io.readsav(input_data, python_dict=True) + + # put the spectra into data frames and combine them + df_UV = pd.DataFrame(data['uv'], index=data['defuv']) + df_VIS = pd.DataFrame(data['vis'], index=data['defvis']) + df_VNIR = pd.DataFrame(data['vnir'], index=data['defvnir']) + df_spect = pd.concat([df_UV, df_VIS, df_VNIR]) + df_spect.columns = ['shot' + str(i + 1) for i in + df_spect.columns] # add 1 to the columns so they correspond to shot number + + df_aUV = pd.DataFrame(data['auv'], index=data['defuv'], columns=['average']) + df_aVIS = pd.DataFrame(data['avis'], index=data['defvis'], columns=['average']) + df_aVNIR = pd.DataFrame(data['avnir'], index=data['defvnir'], columns=['average']) + df_ave = pd.concat([df_aUV, df_aVIS, df_aVNIR]) + + df_mUV = pd.DataFrame(data['muv'], index=data['defuv'], columns=['median']) + df_mVIS = pd.DataFrame(data['mvis'], index=data['defvis'], columns=['median']) + df_mVNIR = pd.DataFrame(data['mvnir'], index=data['defvnir'], columns=['median']) + df_med = pd.concat([df_mUV, df_mVIS, df_mVNIR]) + + df = pd.concat([df_spect, df_ave, df_med], axis=1) + # create multiindex to access wavelength values + # also, round the wavlength values to a more reasonable level of precision + df.index = [['wvl'] * len(df.index), df.index.values.round(4)] + # transpose so that spectra are rows rather than columns + df = df.T + + # extract metadata from the file name and add it to the data frame + # use the multiindex label "meta" for all metadata + + fname = os.path.basename(input_data) + + # for some reason, some ChemCam files have the 'darkname' key, others call it 'darkspect' + # this try-except pair converts to 'darkname' when needed + try: + data['darkname'] + except: + data['darkname'] = data['darkspec'] + + metadata = [fname, + fname[4:13], + fname[25:34].upper(), + fname[34:36], + data['continuumvismin'], + data['continuumvnirmin'], + data['continuumuvmin'], + data['continuumvnirend'], + data['distt'], + data['darkname'], + data['nshots'], + data['dnoiseiter'], + data['dnoisesig'], + data['matchedfilter']] + metadata = np.tile(metadata, (len(df.index), 1)) + metadata_cols = list(zip(['meta'] * len(df.index), ['file', + 'sclock', + 'seqid', + 'Pversion', + 'continuumvismin', + 'continuumvnirmin', + 'continuumuvmin', + 'continuumvnirend', + 'distt', + 'dark', + 'nshots', + 'dnoiseiter', + 'dnoisesig', + 'matchedfilter'])) + metadata = pd.DataFrame(metadata, columns=pd.MultiIndex.from_tuples(metadata_cols), index=df.index) + + df = pd.concat([metadata, df], axis=1) + if ave == True: + df = df.loc['average'] + df = df.to_frame().T + else: + pass + + return df + + +def ccam_batch(directory, searchstring='*.csv', to_csv=None, lookupfile=None, ave=True, progressbar=None): + # Determine if the file is a .csv or .SAV + if '.sav' in searchstring.lower(): + is_sav = True + else: + is_sav = False + filelist = file_search(directory, searchstring) + basenames = np.zeros_like(filelist) + sclocks = np.zeros_like(filelist) + P_version = np.zeros_like(filelist, dtype='int') + + # Extract the sclock and version for each file and ensure that only one + # file per sclock is being read, and that it is the one with the highest version number + for i, name in enumerate(filelist): + basenames[i] = os.path.basename(name) + sclocks[i] = basenames[i][4:13] # extract the sclock + P_version[i] = basenames[i][-5:-4] # extract the version + + sclocks_unique = np.unique(sclocks) # find unique sclocks + filelist_new = np.array([], dtype='str') + for i in sclocks_unique: + match = (sclocks == i) # find all instances with matching sclocks + maxP = P_version[match] == max(P_version[match]) # find the highest version among these files + filelist_new = np.append(filelist_new, filelist[match][maxP]) # keep only the file with thei highest version + + filelist = filelist_new + # Should add a progress bar for importing large numbers of files + dt = [] + + for i, file in enumerate(filelist): + print(file) + if is_sav: + tmp = CCAM_SAV(file, ave=ave) + else: + tmp = CCAM_CSV(file, ave=ave) + if i == 0: + combined = tmp + else: + # This ensures that rounding errors are not causing mismatches in columns + cols1 = list(combined['wvl'].columns) + cols2 = list(tmp['wvl'].columns) + if set(cols1) == set(cols2): + combined = pd.concat([combined, tmp]) + else: + print("Wavelengths don't match!") + + combined.loc[:, ('meta', 'sclock')] = pd.to_numeric(combined.loc[:, ('meta', 'sclock')]) + + if lookupfile is not None: + + combined = lookup(combined, lookupfile=lookupfile.replace('[','').replace(']','').replace("'",'').replace(' ','').split(',')) + if to_csv is not None: + combined.to_csv(to_csv) + return combined diff --git a/plio/io/io_ccs.py b/plio/io/io_ccs.py new file mode 100644 index 0000000..08cd68a --- /dev/null +++ b/plio/io/io_ccs.py @@ -0,0 +1,198 @@ +# This code is used to read individual ChemCam CCS files +# Header data is stored as attributes of the data frame +# White space is stripped from the column names +import os +import time + +import numpy as np +import pandas as pd +import scipy.io as io + +from plio.utils.utils import lookup +from plio.utils.utils import file_search + + +def CCAM_CSV(input_data): + try: + df = pd.read_csv(input_data, header=14, engine='c') + cols = list(df.columns.values) + df.columns = [i.strip().replace('# ', '') for i in cols] # strip whitespace from column names + df.set_index(['wave'], inplace=True) # use wavelengths as indices + # read the file header and put information into the dataframe as new columns + metadata = pd.read_csv(input_data, sep='=', nrows=14, comment=',', engine='c', index_col=0, header=None) + + except: # handle files with an extra header row containing temperature + df = pd.read_csv(input_data, header=15, engine='c') + cols = list(df.columns.values) + df.columns = [i.strip().replace('# ', '') for i in cols] # strip whitespace from column names + df.set_index(['wave'], inplace=True) # use wavelengths as indices + # read the file header and put information into the dataframe as new columns + metadata = pd.read_csv(input_data, sep='=', nrows=15, comment=',', engine='c', index_col=0, header=None) + + df.index = [['wvl'] * len(df.index), + df.index.values.round(4)] # create multiindex so spectra can be easily extracted with a single key + df = df.T # transpose so that each spectrum is a row + + # remove extraneous stuff from the metadataindices + metadata.index = [i.strip().strip('# ').replace(' FLOAT', '').lower() for i in metadata.index.values] + metadata = metadata.T + + # extract info from the file name + fname = os.path.basename(input_data) + metadata['sclock'] = fname[4:13] + metadata['seqid'] = fname[25:34].upper() + metadata['Pversion'] = fname[34:36] + + # duplicate the metadata for each row in the df + metadata = metadata.append([metadata] * (len(df.index) - 1), ignore_index=True) + metadata.index = df.index # make the indices match + metadata.columns = [['meta'] * len(metadata.columns), metadata.columns.values] # make the columns into multiindex + df = pd.concat([metadata, df], axis=1) # combine the spectra with the metadata + return df + + +def CCAM_SAV(input_data, ave=True): + # read the IDL .SAV file + + data = io.readsav(input_data, python_dict=True) + + # put the spectra into data frames and combine them + df_UV = pd.DataFrame(data['uv'], index=data['defuv']) + df_VIS = pd.DataFrame(data['vis'], index=data['defvis']) + df_VNIR = pd.DataFrame(data['vnir'], index=data['defvnir']) + df_spect = pd.concat([df_UV, df_VIS, df_VNIR]) + df_spect.columns = ['shot' + str(i + 1) for i in + df_spect.columns] # add 1 to the columns so they correspond to shot number + + df_aUV = pd.DataFrame(data['auv'], index=data['defuv'], columns=['average']) + df_aVIS = pd.DataFrame(data['avis'], index=data['defvis'], columns=['average']) + df_aVNIR = pd.DataFrame(data['avnir'], index=data['defvnir'], columns=['average']) + df_ave = pd.concat([df_aUV, df_aVIS, df_aVNIR]) + + df_mUV = pd.DataFrame(data['muv'], index=data['defuv'], columns=['median']) + df_mVIS = pd.DataFrame(data['mvis'], index=data['defvis'], columns=['median']) + df_mVNIR = pd.DataFrame(data['mvnir'], index=data['defvnir'], columns=['median']) + df_med = pd.concat([df_mUV, df_mVIS, df_mVNIR]) + + df = pd.concat([df_spect, df_ave, df_med], axis=1) + # create multiindex to access wavelength values + # also, round the wavlength values to a more reasonable level of precision + df.index = [['wvl'] * len(df.index), df.index.values.round(4)] + # transpose so that spectra are rows rather than columns + df = df.T + + # extract metadata from the file name and add it to the data frame + # use the multiindex label "meta" for all metadata + + fname = os.path.basename(input_data) + + # for some reason, some ChemCam files have the 'darkname' key, others call it 'darkspect' + # this try-except pair converts to 'darkname' when needed + try: + data['darkname'] + except: + data['darkname'] = data['darkspec'] + + metadata = [fname, + fname[4:13], + fname[25:34].upper(), + fname[34:36], + data['continuumvismin'], + data['continuumvnirmin'], + data['continuumuvmin'], + data['continuumvnirend'], + data['distt'], + data['darkname'], + data['nshots'], + data['dnoiseiter'], + data['dnoisesig'], + data['matchedfilter']] + metadata = np.tile(metadata, (len(df.index), 1)) + metadata_cols = list(zip(['meta'] * len(df.index), ['file', + 'sclock', + 'seqid', + 'Pversion', + 'continuumvismin', + 'continuumvnirmin', + 'continuumuvmin', + 'continuumvnirend', + 'distt', + 'dark', + 'nshots', + 'dnoiseiter', + 'dnoisesig', + 'matchedfilter'])) + metadata = pd.DataFrame(metadata, columns=pd.MultiIndex.from_tuples(metadata_cols), index=df.index) + + df = pd.concat([metadata, df], axis=1) + if ave == True: + df = df.loc['average'] + df = df.to_frame().T + else: + pass + + return df + + +def ccam_batch(directory, searchstring='*.csv', to_csv=None, lookupfile=None, ave=True, progressbar=None): + # Determine if the file is a .csv or .SAV + if '.sav' in searchstring.lower(): + is_sav = True + else: + is_sav = False + filelist = file_search(directory, searchstring) + basenames = np.zeros_like(filelist) + sclocks = np.zeros_like(filelist) + P_version = np.zeros_like(filelist, dtype='int') + + # Extract the sclock and version for each file and ensure that only one + # file per sclock is being read, and that it is the one with the highest version number + for i, name in enumerate(filelist): + basenames[i] = os.path.basename(name) + sclocks[i] = basenames[i][4:13] # extract the sclock + P_version[i] = basenames[i][-5:-4] # extract the version + + sclocks_unique = np.unique(sclocks) # find unique sclocks + filelist_new = np.array([], dtype='str') + for i in sclocks_unique: + match = (sclocks == i) # find all instances with matching sclocks + maxP = P_version[match] == max(P_version[match]) # find the highest version among these files + filelist_new = np.append(filelist_new, filelist[match][maxP]) # keep only the file with thei highest version + + filelist = filelist_new + # Should add a progress bar for importing large numbers of files + dt = [] + + for i in filelist: + print(i) + try: + if is_sav: + t = time.time() + tmp = CCAM_SAV(i, ave=ave) + dt.append(time.time() - t) + else: + t = time.time() + tmp = CCAM_CSV(i) + + dt.append(time.time() - t) + if i == filelist[0]: + combined = tmp + + else: + # This ensures that rounding errors are not causing mismatches in columns + cols1 = list(combined['wvl'].columns) + cols2 = list(tmp['wvl'].columns) + if set(cols1) == set(cols2): + combined = pd.concat([combined, tmp]) + else: + print("Wavelengths don't match!") + except: + pass + + combined.loc[:, ('meta', 'sclock')] = pd.to_numeric(combined.loc[:, ('meta', 'sclock')]) + + if lookupfile is not None: + combined = lookup(combined, lookupfile=lookupfile) + if to_csv is not None: + combined.to_csv(to_csv) + return combined diff --git a/plio/io/tests/test_io_ccam_pds.py b/plio/io/tests/test_io_ccam_pds.py new file mode 100644 index 0000000..03fec02 --- /dev/null +++ b/plio/io/tests/test_io_ccam_pds.py @@ -0,0 +1,19 @@ +import os +import sys +import unittest + +sys.path.insert(0, os.path.abspath('..')) + +from plio.examples import get_path +from plio.io import io_ccam_pds + +class Test_CCAM_IO(unittest.TestCase): + + def setUp(self): + self.examplefile = get_path('CL5_398645626CCS_F0030004CCAM02013P3.csv') + + def test_14_item_header_csv(self): + io_ccam_pds.CCAM_CSV(self.examplefile) + +if __name__ == '__main__': + unittest.main() diff --git a/plio/io/tests/test_io_ccs.py b/plio/io/tests/test_io_ccs.py new file mode 100644 index 0000000..b2cc814 --- /dev/null +++ b/plio/io/tests/test_io_ccs.py @@ -0,0 +1,19 @@ +import os +import sys +import unittest + +sys.path.insert(0, os.path.abspath('..')) + +from plio.examples import get_path +from plio.io import io_ccs + +class Test_CSS_IO(unittest.TestCase): + + def setUp(self): + self.examplefile = get_path('CL5_398645626CCS_F0030004CCAM02013P3.csv') + + def test_14_item_header_csv(self): + io_ccs.CCAM_CSV(self.examplefile) + +if __name__ == '__main__': + unittest.main() -- GitLab