From b705c997cbd6acd05e33d608f6967be273db0669 Mon Sep 17 00:00:00 2001
From: Adam Paquette <acp263@nau.edu>
Date: Tue, 26 Dec 2017 14:00:56 -0700
Subject: [PATCH] Added io files and their associated tests from pysat.

---
 plio/io/io_ccam_pds.py            | 201 ++++++++++++++++++++++++++++++
 plio/io/io_ccs.py                 | 198 +++++++++++++++++++++++++++++
 plio/io/tests/test_io_ccam_pds.py |  19 +++
 plio/io/tests/test_io_ccs.py      |  19 +++
 4 files changed, 437 insertions(+)
 create mode 100644 plio/io/io_ccam_pds.py
 create mode 100644 plio/io/io_ccs.py
 create mode 100644 plio/io/tests/test_io_ccam_pds.py
 create mode 100644 plio/io/tests/test_io_ccs.py

diff --git a/plio/io/io_ccam_pds.py b/plio/io/io_ccam_pds.py
new file mode 100644
index 0000000..2a9a767
--- /dev/null
+++ b/plio/io/io_ccam_pds.py
@@ -0,0 +1,201 @@
+# This code is used to read individual ChemCam files.
+# Header metadata is stored under a 'meta' column block of the data frame
+# and white space is stripped from the column names.
+import os
+
+import numpy as np
+import pandas as pd
+import scipy.io as io
+
+from plio.utils.utils import lookup
+from plio.utils.utils import file_search
+
+
+def CCAM_CSV(input_data, ave=True):
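+    """Read a single ChemCam CSV file into a pandas DataFrame.
+
+    Parameters
+    ----------
+    input_data : str
+        Path to the ChemCam CSV file.
+    ave : bool
+        If True, keep only the mean spectrum; if False, keep the
+        individual shots and drop the 'mean' and 'median' columns.
+
+    Returns
+    -------
+    pandas.DataFrame
+        Frame with a 'meta' column block holding the file header values
+        and a 'wvl' column block holding the spectra, one row per spectrum.
+    """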
+    try:
+        df = pd.read_csv(input_data, header=14, engine='c')
+        cols = list(df.columns.values)
+        df.columns = [i.strip().replace('# ', '') for i in cols]  # strip whitespace from column names
+        df.set_index(['wave'], inplace=True)  # use wavelengths as indices
+        # read the file header and put information into the dataframe as new columns
+        metadata = pd.read_csv(input_data, sep='=', nrows=14, comment=',', engine='c', index_col=0, header=None)
+    except Exception:
+        try:  # handle files with an extra header row containing temperature
+            df = pd.read_csv(input_data, header=15, engine='c')
+            cols = list(df.columns.values)
+            df.columns = [i.strip().replace('# ', '') for i in cols]  # strip whitespace from column names
+            df.set_index(['wave'], inplace=True)  # use wavelengths as indices
+            # read the file header and put information into the dataframe as new columns
+            metadata = pd.read_csv(input_data, sep='=', nrows=15, comment=',', engine='c', index_col=0, header=None)
+        except Exception:  # handle files with an extra header row containing temperature and target name
+            df = pd.read_csv(input_data, header=16, engine='c')
+            cols = list(df.columns.values)
+            df.columns = [i.strip().replace('# ', '') for i in cols]  # strip whitespace from column names
+            df.set_index(['wave'], inplace=True)  # use wavelengths as indices
+            # read the file header and put information into the dataframe as new columns
+            metadata = pd.read_csv(input_data, sep='=', nrows=16, comment=',', engine='c', index_col=0, header=None)
+
+    if ave:
+        df = pd.DataFrame(df['mean'])
+    else:
+        df = df.drop(['mean', 'median'], axis=1)
+    df.index = [['wvl'] * len(df.index),
+                df.index.values.round(4)]  # create multiindex so spectra can be easily extracted with a single key
+    df = df.T  # transpose so that each spectrum is a row
+
+    # remove extraneous characters from the metadata indices
+    metadata.index = [i.strip().strip('# ').replace(' FLOAT', '').lower() for i in metadata.index.values]
+    metadata = metadata.T
+
+    # extract info from the file name
+    fname = os.path.basename(input_data)
+    metadata['sclock'] = fname[4:13]
+    metadata['seqid'] = fname[25:34].upper()
+    metadata['Pversion'] = fname[34:36]
+
+    # duplicate the metadata for each row in the df
+    if not ave:
+        metadata = pd.concat([metadata] * len(df.index), ignore_index=True)
+    metadata.index = df.index  # make the indices match
+    metadata.columns = [['meta'] * len(metadata.columns), metadata.columns.values]  # make the columns into multiindex
+    df = pd.concat([metadata, df], axis=1)  # combine the spectra with the metadata
+    return df
+
+
+def CCAM_SAV(input_data, ave=True):
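+    """Read a single ChemCam IDL .SAV file into a pandas DataFrame.
+
+    Parameters
+    ----------
+    input_data : str
+        Path to the ChemCam .SAV file.
+    ave : bool
+        If True, return only the average spectrum; if False, return the
+        individual shots along with the average and median spectra.
+
+    Returns
+    -------
+    pandas.DataFrame
+        Frame with a 'meta' column block of file metadata and a 'wvl'
+        column block of spectral values, one row per spectrum.
+    """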
+    # read the IDL .SAV file
+
+    data = io.readsav(input_data, python_dict=True)
+
+    # put the spectra into data frames and combine them
+    df_UV = pd.DataFrame(data['uv'], index=data['defuv'])
+    df_VIS = pd.DataFrame(data['vis'], index=data['defvis'])
+    df_VNIR = pd.DataFrame(data['vnir'], index=data['defvnir'])
+    df_spect = pd.concat([df_UV, df_VIS, df_VNIR])
+    df_spect.columns = ['shot' + str(i + 1) for i in
+                        df_spect.columns]  # add 1 to the columns so they correspond to shot number
+
+    df_aUV = pd.DataFrame(data['auv'], index=data['defuv'], columns=['average'])
+    df_aVIS = pd.DataFrame(data['avis'], index=data['defvis'], columns=['average'])
+    df_aVNIR = pd.DataFrame(data['avnir'], index=data['defvnir'], columns=['average'])
+    df_ave = pd.concat([df_aUV, df_aVIS, df_aVNIR])
+
+    df_mUV = pd.DataFrame(data['muv'], index=data['defuv'], columns=['median'])
+    df_mVIS = pd.DataFrame(data['mvis'], index=data['defvis'], columns=['median'])
+    df_mVNIR = pd.DataFrame(data['mvnir'], index=data['defvnir'], columns=['median'])
+    df_med = pd.concat([df_mUV, df_mVIS, df_mVNIR])
+
+    df = pd.concat([df_spect, df_ave, df_med], axis=1)
+    # create multiindex to access wavelength values
+    # also, round the wavelength values to a more reasonable level of precision
+    df.index = [['wvl'] * len(df.index), df.index.values.round(4)]
+    # transpose so that spectra are rows rather than columns
+    df = df.T
+
+    # extract metadata from the file name and add it to the data frame
+    # use the multiindex label "meta" for all metadata
+
+    fname = os.path.basename(input_data)
+
+    # some ChemCam files have the 'darkname' key, others call it 'darkspec'
+    # fall back to 'darkspec' when 'darkname' is missing
+    if 'darkname' not in data:
+        data['darkname'] = data['darkspec']
+
+    metadata = [fname,
+                fname[4:13],
+                fname[25:34].upper(),
+                fname[34:36],
+                data['continuumvismin'],
+                data['continuumvnirmin'],
+                data['continuumuvmin'],
+                data['continuumvnirend'],
+                data['distt'],
+                data['darkname'],
+                data['nshots'],
+                data['dnoiseiter'],
+                data['dnoisesig'],
+                data['matchedfilter']]
+    metadata = np.tile(metadata, (len(df.index), 1))
+    metadata_cols = list(zip(['meta'] * len(df.index), ['file',
+                                                        'sclock',
+                                                        'seqid',
+                                                        'Pversion',
+                                                        'continuumvismin',
+                                                        'continuumvnirmin',
+                                                        'continuumuvmin',
+                                                        'continuumvnirend',
+                                                        'distt',
+                                                        'dark',
+                                                        'nshots',
+                                                        'dnoiseiter',
+                                                        'dnoisesig',
+                                                        'matchedfilter']))
+    metadata = pd.DataFrame(metadata, columns=pd.MultiIndex.from_tuples(metadata_cols), index=df.index)
+
+    df = pd.concat([metadata, df], axis=1)
+    if ave:
+        df = df.loc['average']
+        df = df.to_frame().T
+
+    return df
+
+
+def ccam_batch(directory, searchstring='*.csv', to_csv=None, lookupfile=None, ave=True, progressbar=None):
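+    """Read all ChemCam files matching searchstring in a directory and
+    combine them into a single DataFrame.
+
+    For each unique spacecraft clock (sclock) only the file with the
+    highest processing version is read. Files are parsed with CCAM_SAV
+    or CCAM_CSV depending on whether searchstring matches .SAV files.
+    If lookupfile is given, it is treated as a comma-separated list of
+    lookup files whose contents are merged into the result; if to_csv
+    is given, the combined frame is also written to that path.
+    """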
+    # Determine if the file is a .csv or .SAV
+    if '.sav' in searchstring.lower():
+        is_sav = True
+    else:
+        is_sav = False
+    filelist = file_search(directory, searchstring)
+    basenames = np.zeros_like(filelist)
+    sclocks = np.zeros_like(filelist)
+    P_version = np.zeros_like(filelist, dtype='int')
+
+    # Extract the sclock and version for each file and ensure that only one
+    # file per sclock is being read, and that it is the one with the highest version number
+    for i, name in enumerate(filelist):
+        basenames[i] = os.path.basename(name)
+        sclocks[i] = basenames[i][4:13]  # extract the sclock
+        P_version[i] = basenames[i][-5:-4]  # extract the version
+
+    sclocks_unique = np.unique(sclocks)  # find unique sclocks
+    filelist_new = np.array([], dtype='str')
+    for i in sclocks_unique:
+        match = (sclocks == i)  # find all instances with matching sclocks
+        maxP = P_version[match] == max(P_version[match])  # find the highest version among these files
+        filelist_new = np.append(filelist_new, filelist[match][maxP])  # keep only the file with the highest version
+
+    filelist = filelist_new
+    # Should add a progress bar for importing large numbers of files
+
+    for i, file in enumerate(filelist):
+        print(file)
+        if is_sav:
+            tmp = CCAM_SAV(file, ave=ave)
+        else:
+            tmp = CCAM_CSV(file, ave=ave)
+        if i == 0:
+            combined = tmp
+        else:
+            # This ensures that rounding errors are not causing mismatches in columns
+            cols1 = list(combined['wvl'].columns)
+            cols2 = list(tmp['wvl'].columns)
+            if set(cols1) == set(cols2):
+                combined = pd.concat([combined, tmp])
+            else:
+                print("Wavelengths don't match!")
+
+    combined.loc[:, ('meta', 'sclock')] = pd.to_numeric(combined.loc[:, ('meta', 'sclock')])
+
+    if lookupfile is not None:
+
+        combined = lookup(combined, lookupfile=lookupfile.replace('[','').replace(']','').replace("'",'').replace(' ','').split(','))
+    if to_csv is not None:
+        combined.to_csv(to_csv)
+    return combined
diff --git a/plio/io/io_ccs.py b/plio/io/io_ccs.py
new file mode 100644
index 0000000..08cd68a
--- /dev/null
+++ b/plio/io/io_ccs.py
@@ -0,0 +1,198 @@
+# This code is used to read individual ChemCam CCS files.
+# Header metadata is stored under a 'meta' column block of the data frame
+# and white space is stripped from the column names.
+import os
+import time
+
+import numpy as np
+import pandas as pd
+import scipy.io as io
+
+from plio.utils.utils import lookup
+from plio.utils.utils import file_search
+
+
+def CCAM_CSV(input_data):
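+    """Read a single ChemCam CCS CSV file into a pandas DataFrame.
+
+    Parameters
+    ----------
+    input_data : str
+        Path to the ChemCam CCS CSV file.
+
+    Returns
+    -------
+    pandas.DataFrame
+        Frame with a 'meta' column block holding the file header values
+        and a 'wvl' column block holding the spectra, one row per spectrum.
+    """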
+    try:
+        df = pd.read_csv(input_data, header=14, engine='c')
+        cols = list(df.columns.values)
+        df.columns = [i.strip().replace('# ', '') for i in cols]  # strip whitespace from column names
+        df.set_index(['wave'], inplace=True)  # use wavelengths as indices
+        # read the file header and put information into the dataframe as new columns
+        metadata = pd.read_csv(input_data, sep='=', nrows=14, comment=',', engine='c', index_col=0, header=None)
+
+    except Exception:  # handle files with an extra header row containing temperature
+        df = pd.read_csv(input_data, header=15, engine='c')
+        cols = list(df.columns.values)
+        df.columns = [i.strip().replace('# ', '') for i in cols]  # strip whitespace from column names
+        df.set_index(['wave'], inplace=True)  # use wavelengths as indices
+        # read the file header and put information into the dataframe as new columns
+        metadata = pd.read_csv(input_data, sep='=', nrows=15, comment=',', engine='c', index_col=0, header=None)
+
+    df.index = [['wvl'] * len(df.index),
+                df.index.values.round(4)]  # create multiindex so spectra can be easily extracted with a single key
+    df = df.T  # transpose so that each spectrum is a row
+
+    # remove extraneous characters from the metadata indices
+    metadata.index = [i.strip().strip('# ').replace(' FLOAT', '').lower() for i in metadata.index.values]
+    metadata = metadata.T
+
+    # extract info from the file name
+    fname = os.path.basename(input_data)
+    metadata['sclock'] = fname[4:13]
+    metadata['seqid'] = fname[25:34].upper()
+    metadata['Pversion'] = fname[34:36]
+
+    # duplicate the metadata for each row in the df
+    metadata = pd.concat([metadata] * len(df.index), ignore_index=True)
+    metadata.index = df.index  # make the indices match
+    metadata.columns = [['meta'] * len(metadata.columns), metadata.columns.values]  # make the columns into multiindex
+    df = pd.concat([metadata, df], axis=1)  # combine the spectra with the metadata
+    return df
+
+
+def CCAM_SAV(input_data, ave=True):
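+    """Read a single ChemCam IDL .SAV file into a pandas DataFrame.
+
+    Parameters
+    ----------
+    input_data : str
+        Path to the ChemCam .SAV file.
+    ave : bool
+        If True, return only the average spectrum; if False, return the
+        individual shots along with the average and median spectra.
+
+    Returns
+    -------
+    pandas.DataFrame
+        Frame with a 'meta' column block of file metadata and a 'wvl'
+        column block of spectral values, one row per spectrum.
+    """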
+    # read the IDL .SAV file
+
+    data = io.readsav(input_data, python_dict=True)
+
+    # put the spectra into data frames and combine them
+    df_UV = pd.DataFrame(data['uv'], index=data['defuv'])
+    df_VIS = pd.DataFrame(data['vis'], index=data['defvis'])
+    df_VNIR = pd.DataFrame(data['vnir'], index=data['defvnir'])
+    df_spect = pd.concat([df_UV, df_VIS, df_VNIR])
+    df_spect.columns = ['shot' + str(i + 1) for i in
+                        df_spect.columns]  # add 1 to the columns so they correspond to shot number
+
+    df_aUV = pd.DataFrame(data['auv'], index=data['defuv'], columns=['average'])
+    df_aVIS = pd.DataFrame(data['avis'], index=data['defvis'], columns=['average'])
+    df_aVNIR = pd.DataFrame(data['avnir'], index=data['defvnir'], columns=['average'])
+    df_ave = pd.concat([df_aUV, df_aVIS, df_aVNIR])
+
+    df_mUV = pd.DataFrame(data['muv'], index=data['defuv'], columns=['median'])
+    df_mVIS = pd.DataFrame(data['mvis'], index=data['defvis'], columns=['median'])
+    df_mVNIR = pd.DataFrame(data['mvnir'], index=data['defvnir'], columns=['median'])
+    df_med = pd.concat([df_mUV, df_mVIS, df_mVNIR])
+
+    df = pd.concat([df_spect, df_ave, df_med], axis=1)
+    # create multiindex to access wavelength values
+    # also, round the wavelength values to a more reasonable level of precision
+    df.index = [['wvl'] * len(df.index), df.index.values.round(4)]
+    # transpose so that spectra are rows rather than columns
+    df = df.T
+
+    # extract metadata from the file name and add it to the data frame
+    # use the multiindex label "meta" for all metadata
+
+    fname = os.path.basename(input_data)
+
+    # some ChemCam files have the 'darkname' key, others call it 'darkspec'
+    # fall back to 'darkspec' when 'darkname' is missing
+    if 'darkname' not in data:
+        data['darkname'] = data['darkspec']
+
+    metadata = [fname,
+                fname[4:13],
+                fname[25:34].upper(),
+                fname[34:36],
+                data['continuumvismin'],
+                data['continuumvnirmin'],
+                data['continuumuvmin'],
+                data['continuumvnirend'],
+                data['distt'],
+                data['darkname'],
+                data['nshots'],
+                data['dnoiseiter'],
+                data['dnoisesig'],
+                data['matchedfilter']]
+    metadata = np.tile(metadata, (len(df.index), 1))
+    metadata_cols = list(zip(['meta'] * len(df.index), ['file',
+                                                        'sclock',
+                                                        'seqid',
+                                                        'Pversion',
+                                                        'continuumvismin',
+                                                        'continuumvnirmin',
+                                                        'continuumuvmin',
+                                                        'continuumvnirend',
+                                                        'distt',
+                                                        'dark',
+                                                        'nshots',
+                                                        'dnoiseiter',
+                                                        'dnoisesig',
+                                                        'matchedfilter']))
+    metadata = pd.DataFrame(metadata, columns=pd.MultiIndex.from_tuples(metadata_cols), index=df.index)
+
+    df = pd.concat([metadata, df], axis=1)
+    if ave:
+        df = df.loc['average']
+        df = df.to_frame().T
+
+    return df
+
+
+def ccam_batch(directory, searchstring='*.csv', to_csv=None, lookupfile=None, ave=True, progressbar=None):
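+    """Read all ChemCam CCS files matching searchstring in a directory
+    and combine them into a single DataFrame.
+
+    For each unique spacecraft clock (sclock) only the file with the
+    highest processing version is read. Files are parsed with CCAM_SAV
+    or CCAM_CSV depending on whether searchstring matches .SAV files;
+    files that fail to parse are skipped. If lookupfile is given, it is
+    passed to lookup to merge additional metadata into the result; if
+    to_csv is given, the combined frame is also written to that path.
+    """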
+    # Determine if the file is a .csv or .SAV
+    if '.sav' in searchstring.lower():
+        is_sav = True
+    else:
+        is_sav = False
+    filelist = file_search(directory, searchstring)
+    basenames = np.zeros_like(filelist)
+    sclocks = np.zeros_like(filelist)
+    P_version = np.zeros_like(filelist, dtype='int')
+
+    # Extract the sclock and version for each file and ensure that only one
+    # file per sclock is being read, and that it is the one with the highest version number
+    for i, name in enumerate(filelist):
+        basenames[i] = os.path.basename(name)
+        sclocks[i] = basenames[i][4:13]  # extract the sclock
+        P_version[i] = basenames[i][-5:-4]  # extract the version
+
+    sclocks_unique = np.unique(sclocks)  # find unique sclocks
+    filelist_new = np.array([], dtype='str')
+    for i in sclocks_unique:
+        match = (sclocks == i)  # find all instances with matching sclocks
+        maxP = P_version[match] == max(P_version[match])  # find the highest version among these files
+        filelist_new = np.append(filelist_new, filelist[match][maxP])  # keep only the file with the highest version
+
+    filelist = filelist_new
+    # Should add a progress bar for importing large numbers of files
+    dt = []
+    combined = None  # holds the combined DataFrame once the first file is read successfully
+
+    for i in filelist:
+        print(i)
+        try:
+            if is_sav:
+                t = time.time()
+                tmp = CCAM_SAV(i, ave=ave)
+                dt.append(time.time() - t)
+            else:
+                t = time.time()
+                tmp = CCAM_CSV(i)
+
+                dt.append(time.time() - t)
+            if combined is None:
+                combined = tmp
+
+            else:
+                # This ensures that rounding errors are not causing mismatches in columns
+                cols1 = list(combined['wvl'].columns)
+                cols2 = list(tmp['wvl'].columns)
+                if set(cols1) == set(cols2):
+                    combined = pd.concat([combined, tmp])
+                else:
+                    print("Wavelengths don't match!")
+        except Exception:
+            pass  # skip files that fail to parse
+
+    combined.loc[:, ('meta', 'sclock')] = pd.to_numeric(combined.loc[:, ('meta', 'sclock')])
+
+    if lookupfile is not None:
+        combined = lookup(combined, lookupfile=lookupfile)
+    if to_csv is not None:
+        combined.to_csv(to_csv)
+    return combined
diff --git a/plio/io/tests/test_io_ccam_pds.py b/plio/io/tests/test_io_ccam_pds.py
new file mode 100644
index 0000000..03fec02
--- /dev/null
+++ b/plio/io/tests/test_io_ccam_pds.py
@@ -0,0 +1,19 @@
+import os
+import sys
+import unittest
+
+sys.path.insert(0, os.path.abspath('..'))
+
+from plio.examples import get_path
+from plio.io import io_ccam_pds
+
+class Test_CCAM_IO(unittest.TestCase):
+
+    def setUp(self):
+        self.examplefile = get_path('CL5_398645626CCS_F0030004CCAM02013P3.csv')
+
+    def test_14_item_header_csv(self):
+        io_ccam_pds.CCAM_CSV(self.examplefile)
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/plio/io/tests/test_io_ccs.py b/plio/io/tests/test_io_ccs.py
new file mode 100644
index 0000000..b2cc814
--- /dev/null
+++ b/plio/io/tests/test_io_ccs.py
@@ -0,0 +1,19 @@
+import os
+import sys
+import unittest
+
+sys.path.insert(0, os.path.abspath('..'))
+
+from plio.examples import get_path
+from plio.io import io_ccs
+
+class Test_CCS_IO(unittest.TestCase):
+
+    def setUp(self):
+        self.examplefile = get_path('CL5_398645626CCS_F0030004CCAM02013P3.csv')
+
+    def test_14_item_header_csv(self):
+        io_ccs.CCAM_CSV(self.examplefile)
+
+if __name__ == '__main__':
+    unittest.main()
-- 
GitLab