Commit b95a55b3 authored by jlaura, committed by GitHub

Merge pull request #17 from acpaquette/pysat_io

PySAT io to PL io
Parents: a6b9d17b, 8d4d0077
@@ -7,7 +7,7 @@ branches:
 os:
   - linux
   - osx

 env:
-  - PYTHON_VERSION=3.5
+  - PYTHON_VERSION=3.6
@@ -30,24 +30,24 @@ before_install:
   # Create the env
   - conda create -q -n test python=$PYTHON_VERSION
   - source activate test

 install:
   - conda config --add channels conda-forge
   - conda config --add channels jlaura
-  - conda install -c conda-forge gdal h5py
-  - conda install pandas sqlalchemy pyyaml networkx affine protobuf
+  - conda install -c conda-forge gdal h5py
+  - conda install pandas sqlalchemy pyyaml networkx affine protobuf scipy
   - pip install pvl

   # Development installation
   - conda install pytest pytest-cov sh anaconda-client

 script:
   - pytest --cov=plio

 after_success:
   - coveralls
   # Need to do the build in the root
-  - source deactivate
+  - source deactivate
   - conda install conda-build anaconda-client
   - conda config --set anaconda_upload yes
   - conda build --token $CONDA_UPLOAD_TOKEN --python $PYTHON_VERSION recipe
@@ -54,7 +54,7 @@ install:
 - cmd: conda config --add channels conda-forge
 - cmd: conda config --add channels jlaura
 - cmd: conda install --yes -c conda-forge gdal h5py
-- cmd: conda install --yes pandas sqlalchemy pyyaml networkx affine
+- cmd: conda install --yes pandas sqlalchemy pyyaml networkx affine scipy
 - cmd: conda install --yes -c jlaura protobuf pvl

 # Development installation
@@ -3,7 +3,7 @@ import plio
 __all__ = ['available', 'get_path']

-#Used largely unmodififed from:
+# Used largely unmodified from:
 # https://github.com/pysal/pysal/blob/master/pysal/examples/__init__.py

 base = os.path.split(plio.__file__)[0]
# This code is used to read individual ChemCam files
# Header data is stored as attributes of the data frame
# White space is stripped from the column names
import os
import numpy as np
import pandas as pd
import scipy.io as io
from plio.utils.utils import lookup
from plio.utils.utils import file_search
def CCAM_CSV(input_data, ave=True):
    # ChemCam CCS csv files nominally have a 14-line header, but some carry one
    # extra header row (temperature) or two (temperature and target name),
    # so try each header size in turn and re-raise if none of them work
    for nrows in (14, 15, 16):
        try:
            df = pd.read_csv(input_data, header=nrows, engine='c')
            # strip whitespace from the column names
            df.columns = [i.strip().replace('# ', '') for i in df.columns.values]
            df.set_index(['wave'], inplace=True)  # use wavelengths as indices
            # read the file header so its values can be added to the data frame as new columns
            metadata = pd.read_csv(input_data, sep='=', nrows=nrows, comment=',',
                                   engine='c', index_col=0, header=None)
            break
        except Exception:
            if nrows == 16:
                raise
if ave:
df = pd.DataFrame(df['mean'])
else:
df = df.drop(['mean', 'median'], axis=1)
df.index = [['wvl'] * len(df.index),
df.index.values.round(4)] # create multiindex so spectra can be easily extracted with a single key
df = df.T # transpose so that each spectrum is a row
    # remove extraneous characters from the metadata indices
metadata.index = [i.strip().strip('# ').replace(' FLOAT', '').lower() for i in metadata.index.values]
metadata = metadata.T
# extract info from the file name
fname = os.path.basename(input_data)
metadata['sclock'] = fname[4:13]
metadata['seqid'] = fname[25:34].upper()
metadata['Pversion'] = fname[34:36]
# duplicate the metadata for each row in the df
if not ave:
metadata = metadata.append([metadata] * (len(df.index) - 1), ignore_index=True)
metadata.index = df.index # make the indices match
metadata.columns = [['meta'] * len(metadata.columns), metadata.columns.values] # make the columns into multiindex
df = pd.concat([metadata, df], axis=1) # combine the spectra with the metadata
return df
def CCAM_SAV(input_data, ave=True):
# read the IDL .SAV file
data = io.readsav(input_data, python_dict=True)
# put the spectra into data frames and combine them
df_UV = pd.DataFrame(data['uv'], index=data['defuv'])
df_VIS = pd.DataFrame(data['vis'], index=data['defvis'])
df_VNIR = pd.DataFrame(data['vnir'], index=data['defvnir'])
df_spect = pd.concat([df_UV, df_VIS, df_VNIR])
df_spect.columns = ['shot' + str(i + 1) for i in
df_spect.columns] # add 1 to the columns so they correspond to shot number
df_aUV = pd.DataFrame(data['auv'], index=data['defuv'], columns=['average'])
df_aVIS = pd.DataFrame(data['avis'], index=data['defvis'], columns=['average'])
df_aVNIR = pd.DataFrame(data['avnir'], index=data['defvnir'], columns=['average'])
df_ave = pd.concat([df_aUV, df_aVIS, df_aVNIR])
df_mUV = pd.DataFrame(data['muv'], index=data['defuv'], columns=['median'])
df_mVIS = pd.DataFrame(data['mvis'], index=data['defvis'], columns=['median'])
df_mVNIR = pd.DataFrame(data['mvnir'], index=data['defvnir'], columns=['median'])
df_med = pd.concat([df_mUV, df_mVIS, df_mVNIR])
df = pd.concat([df_spect, df_ave, df_med], axis=1)
# create multiindex to access wavelength values
    # also, round the wavelength values to a more reasonable level of precision
df.index = [['wvl'] * len(df.index), df.index.values.round(4)]
# transpose so that spectra are rows rather than columns
df = df.T
# extract metadata from the file name and add it to the data frame
# use the multiindex label "meta" for all metadata
fname = os.path.basename(input_data)
    # some ChemCam files store the dark spectrum name under the 'darkname' key,
    # others under 'darkspec'; normalize to 'darkname'
    if 'darkname' not in data:
        data['darkname'] = data['darkspec']
metadata = [fname,
fname[4:13],
fname[25:34].upper(),
fname[34:36],
data['continuumvismin'],
data['continuumvnirmin'],
data['continuumuvmin'],
data['continuumvnirend'],
data['distt'],
data['darkname'],
data['nshots'],
data['dnoiseiter'],
data['dnoisesig'],
data['matchedfilter']]
metadata = np.tile(metadata, (len(df.index), 1))
metadata_cols = list(zip(['meta'] * len(df.index), ['file',
'sclock',
'seqid',
'Pversion',
'continuumvismin',
'continuumvnirmin',
'continuumuvmin',
'continuumvnirend',
'distt',
'dark',
'nshots',
'dnoiseiter',
'dnoisesig',
'matchedfilter']))
metadata = pd.DataFrame(metadata, columns=pd.MultiIndex.from_tuples(metadata_cols), index=df.index)
df = pd.concat([metadata, df], axis=1)
    if ave:
        df = df.loc['average']
        df = df.to_frame().T
return df
def ccam_batch(directory, searchstring='*.csv', to_csv=None, lookupfile=None, ave=True, progressbar=None):
    # Determine whether the files are .csv or .SAV
    is_sav = '.sav' in searchstring.lower()
filelist = file_search(directory, searchstring)
basenames = np.zeros_like(filelist)
sclocks = np.zeros_like(filelist)
P_version = np.zeros_like(filelist, dtype='int')
# Extract the sclock and version for each file and ensure that only one
# file per sclock is being read, and that it is the one with the highest version number
for i, name in enumerate(filelist):
basenames[i] = os.path.basename(name)
sclocks[i] = basenames[i][4:13] # extract the sclock
P_version[i] = basenames[i][-5:-4] # extract the version
sclocks_unique = np.unique(sclocks) # find unique sclocks
filelist_new = np.array([], dtype='str')
for i in sclocks_unique:
match = (sclocks == i) # find all instances with matching sclocks
maxP = P_version[match] == max(P_version[match]) # find the highest version among these files
        filelist_new = np.append(filelist_new, filelist[match][maxP])  # keep only the file with the highest version
filelist = filelist_new
# Should add a progress bar for importing large numbers of files
dt = []
for i, file in enumerate(filelist):
print(file)
if is_sav:
tmp = CCAM_SAV(file, ave=ave)
else:
tmp = CCAM_CSV(file, ave=ave)
if i == 0:
combined = tmp
else:
# This ensures that rounding errors are not causing mismatches in columns
cols1 = list(combined['wvl'].columns)
cols2 = list(tmp['wvl'].columns)
if set(cols1) == set(cols2):
combined = pd.concat([combined, tmp])
else:
print("Wavelengths don't match!")
combined.loc[:, ('meta', 'sclock')] = pd.to_numeric(combined.loc[:, ('meta', 'sclock')])
if lookupfile is not None:
combined = lookup(combined, lookupfile=lookupfile.replace('[','').replace(']','').replace("'",'').replace(' ','').split(','))
if to_csv is not None:
combined.to_csv(to_csv)
return combined
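A minimal usage sketch for the readers above; the directory path is hypothetical, and the single-file name is the example csv shipped with plio's tests.

from plio.io.io_ccam_pds import CCAM_CSV, ccam_batch

# read a single ChemCam CCS csv, keeping only the mean spectrum
df = CCAM_CSV('CL5_398645626CCS_F0030004CCAM02013P3.csv', ave=True)
spectra = df['wvl']    # the 'wvl' level of the column multiindex holds the spectra
metadata = df['meta']  # the 'meta' level holds header and filename metadata

# combine every matching file under a directory (path is hypothetical)
combined = ccam_batch('/path/to/ccs_files', searchstring='*.csv', ave=True)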
import os
import numpy as np
import pandas as pd
def EDR(input_file):
f = open(input_file, 'rb') # read as bytes so python won't complain about the binary part of the file
# read lines of the header until reaching the end of the libs table (collecting other metadata along the way)
end_of_libs_table = False
while end_of_libs_table is False:
        # convert the current line to a string and strip the newline characters
        line = str(f.readline(), 'utf-8').replace('\r', '').replace('\n', '')
line = line.split('=') # split the line on equals sign if present
# look for the name of the value we want, if the current line has it, then set the value
if 'RECORD_BYTES' in line[0]:
rbytes = int(line[1])
if 'LABEL_RECORDS' in line[0]:
lrecs = int(line[1])
if 'SPACECRAFT_CLOCK_START_COUNT' in line[0]:
sclock = int(line[1].replace('"', '').split('.')[0])
if 'SEQUENCE_ID' in line[0]:
seqID = line[1].replace('"', '')
if 'INSTRUMENT_FOCUS_DISTANCE' in line[0]:
focus_dist = int(line[1])
        if 'INSTRUMENT_TEMPERATURE' in line[0]:
            # the temperature values continue over the next three header lines
            instrument_temps = line[1] \
                + str(f.readline(), 'utf-8').replace('\r', '').replace('\n', '') \
                + str(f.readline(), 'utf-8').replace('\r', '').replace('\n', '') \
                + str(f.readline(), 'utf-8').replace('\r', '').replace('\n', '')
            instrument_temps = [float(i) for i in instrument_temps.replace('<degC>', '').replace('(', '').replace(')', '').replace(' ', '').split(',')]
            # the corresponding temperature names continue over the next four header lines
            instrument_temps_name = str(f.readline(), 'utf-8').replace('\r', '').replace('\n', '')
            instrument_temps_name = instrument_temps_name.split('=')[1] \
                + str(f.readline(), 'utf-8').replace('\r', '').replace('\n', '') \
                + str(f.readline(), 'utf-8').replace('\r', '').replace('\n', '') \
                + str(f.readline(), 'utf-8').replace('\r', '').replace('\n', '') \
                + str(f.readline(), 'utf-8').replace('\r', '').replace('\n', '')
            instrument_temps_name = instrument_temps_name.replace(' ', '').replace('(', '').replace(')', '').replace('"', '').split(',')
            f.readline()
        try:
            if 'CCAM_LIBS_DATA_CONTAINER' in line[1]:
                nshots = int(str(f.readline(), 'utf-8').replace('\r', '').replace('\n', '').split('=')[1])
                start_byte = int(str(f.readline(), 'utf-8').replace('\r', '').replace('\n', '').split('=')[1])
            if 'END_OBJECT' in line[0] and 'CCAM_LIBS_TABLE' in line[1]:
                end_of_libs_table = True
        except IndexError:
            # lines without an '=' split to a single element and have no line[1]
            pass
f.close()
header_skip = lrecs * rbytes # calculate the number of header bytes to skip to get to the real data
with open(input_file, "rb") as f:
f.seek(header_skip + start_byte - 1, 0)
spectra = []
        while len(spectra) < nshots:
            spectrum = []
            while len(spectrum) < 6444:  # each spectrum is 6444 two-byte channels
                spectrum.append(int.from_bytes(f.read(2), byteorder='big', signed=False))
            spectra.append(spectrum)
spectra = np.array(spectra, dtype='int')
cols = np.array(list(range(spectra.shape[1]))) + 1
cols = [('channel', i) for i in cols]
inds = np.array(list(range(spectra.shape[0]))) + 1
sp = pd.DataFrame(spectra, columns=pd.MultiIndex.from_tuples(cols), index=inds)
sp[('meta', 'EDR_file')] = os.path.basename(input_file)
sp[('meta', 'Spacecraft_Clock')] = sclock
sp[('meta', 'Shot')] = sp.index
sp[('meta', 'SeqID')] = seqID
sp[('meta', 'Focus_Distance')] = focus_dist
for ind, name in enumerate(instrument_temps_name):
sp[('meta', name + '_temp')] = instrument_temps[ind]
return sp
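A hedged sketch of the EDR reader; the file name below comes from the commented-out test further down and may not be distributed with plio.

from plio.io.io_edr import EDR

sp = EDR('cl5_398736801edr_f0030004ccam01014m1.dat')
print(sp['channel'].shape)  # one row per laser shot, 6444 channels each
print(sp['meta'].head())    # spacecraft clock, SeqID, focus distance, temperatures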
import os
import numpy as np
import pandas as pd
from pandas.core.common import array_equivalent
from plio.utils.utils import file_search
# This function reads the lookup tables used to expand metadata from the file names
# This is separated from parsing the filenames so that for large lists of files the
# lookup tables don't need to be read over and over
#
# Info in the tables is stored in a dict of dataframes so that only one variable
# (the dict) needs to be passed between functions
def read_refdata(LUT_files):
ID_info = pd.read_csv(LUT_files['ID'], index_col=0)
spectrometer_info = pd.read_csv(LUT_files['spect'], index_col=0)
# spectrometer_info.reset_index(inplace=True)
laser_info = pd.read_csv(LUT_files['laser'], index_col=0)
# laser_info.reset_index(inplace=True)
exp_info = pd.read_csv(LUT_files['exp'], index_col=0)
# exp_info.reset_index(inplace=True)
sample_info = pd.read_csv(LUT_files['sample'], index_col=0)
# sample_info.reset_index(inplace=True)
refdata = {'spect': spectrometer_info, 'laser': laser_info, 'exp': exp_info, 'sample': sample_info, 'ID': ID_info}
return refdata
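A sketch of the LUT_files mapping read_refdata expects; the csv file names are hypothetical, but the five keys are the ones the function reads.

LUT_files = {'ID': 'ID_lookup.csv',
             'spect': 'spectrometer_lookup.csv',
             'laser': 'laser_lookup.csv',
             'exp': 'experiment_lookup.csv',
             'sample': 'sample_lookup.csv'}
refdata = read_refdata(LUT_files)  # one data frame per table, keyed by name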
# This function parses the file names to record metadata related to the observation
def jsc_filename_parse(filename, refdata):
filename = os.path.basename(filename) # strip the path off of the file name
filename = filename.split('_') # split the file name on underscores
libs_ID = filename[0]
laserID = filename[4][0]
expID = filename[5]
spectID = filename[6]
try:
sampleID = refdata['ID'].loc[libs_ID].values[0]
file_info = pd.DataFrame(refdata['sample'].loc[sampleID])
if file_info.columns.shape[0] < file_info.index.shape[0]:
file_info = file_info.T
if file_info.index.shape[0] > 1:
print('More than one matching row for ' + sampleID + '!')
tempID = 'Unknown'
file_info = pd.DataFrame(refdata['sample'].loc[tempID])
if file_info.columns.shape[0] < file_info.index.shape[0]:
file_info = file_info.T
    except KeyError:  # the LIBS ID is missing from the lookup table
sampleID = 'Unknown'
file_info = pd.DataFrame(refdata['sample'].loc[sampleID])
if file_info.columns.shape[0] < file_info.index.shape[0]:
file_info = file_info.T
file_info['Sample ID'] = sampleID
file_info['LIBS ID'] = libs_ID
file_info.reset_index(level=0, inplace=True, drop=True)
file_info['loc'] = int(filename[1])
file_info['lab'] = filename[2]
file_info['gas'] = filename[3][0]
file_info['pressure'] = float(filename[3][1:])
if laserID in refdata['laser'].index:
laser_info = pd.DataFrame(refdata['laser'].loc[laserID]).T
laser_info.index.name = 'Laser Identifier'
laser_info.reset_index(level=0, inplace=True)
file_info = pd.concat([file_info, laser_info], axis=1)
file_info['laser_power'] = float(filename[4][1:])
if expID in refdata['exp'].index:
exp_info = pd.DataFrame(refdata['exp'].loc[expID]).T
exp_info.index.name = 'Exp Identifier'
exp_info.reset_index(level=0, inplace=True)
file_info = pd.concat([file_info, exp_info], axis=1)
file_info['spectrometer'] = spectID
    if spectID in refdata['spect'].index:
spect_info = pd.DataFrame(refdata['spect'].loc[spectID]).T
spect_info.index.name = 'Spectrometer Identifier'
spect_info.reset_index(level=0, inplace=True)
file_info = pd.concat([file_info, spect_info], axis=1)
return file_info
def JSC(input_files, refdata):
try:
# read the first file
data = pd.read_csv(input_files[0], skiprows=14, sep='\t', engine='c')
data = data.rename(columns={data.columns[0]: 'time1', data.columns[1]: 'time2'})
metadata = pd.concat([jsc_filename_parse(input_files[0], refdata)] * len(data.index))
metadata.drop('spectrometer', axis=1, inplace=True)
# read the next files and merge them with the first
for file in input_files[1:]:
datatemp = pd.read_csv(file, skiprows=14, sep='\t', engine='c')
datatemp = datatemp.rename(columns={datatemp.columns[0]: 'time1', datatemp.columns[1]: 'time2'})
data = data.merge(datatemp)
time = data[['time1', 'time2']] # split the two time columns from the data frame
data.drop(['time1', 'time2'], axis=1, inplace=True) # trim the data frame so it is just the spectra
        # make a multiindex for each wavelength column so they can be easily isolated from metadata later
data.columns = [['wvl'] * len(data.columns), np.array(data.columns.values, dtype='float').round(4)]
metadata.index = data.index
metadata = pd.concat([metadata, time], axis=1)
compcols = ['SiO2', 'TiO2', 'Al2O3', 'Cr2O3', 'Fe2O3T', 'MnO', 'MgO', 'CaO', 'Na2O', 'K2O', 'P2O5',
'SO3 LOI Residue', 'Total', 'Total Includes', '%LOI', 'FeO',
'Fe2O3', 'SO3 Actual', 'Fe(3+)/Fe(Total)', 'Rb (ug/g)', 'Sr (ug/g)', 'Y (ug/g)', 'Zr (ug/g)',
'V (ug/g)', 'Ni (ug/g)', 'Cr (ug/g)',
'Nb (ug/g)', 'Ga (ug/g)', 'Cu (ug/g)', 'Zn (ug/g)', 'Co (ug/g)', 'Ba (ug/g)', 'La (ug/g)',
'Ce (ug/g)', 'U (ug/g)', 'Th (ug/g)', 'Sc (ug/g)',
'Pb (ug/g)', 'Ge (ug/g)', 'As (ug/g)', 'Cl (ug/g)']
compdata = metadata[compcols]
metadata.drop(compcols, axis=1, inplace=True)
metadata.columns = [['meta'] * len(metadata.columns), metadata.columns.values]
compdata.columns = [['comp'] * len(compdata.columns), compdata.columns.values]
data = pd.concat([data, metadata, compdata], axis=1)
data[('meta', 'Scan #')] = data.index
data.set_index(('meta', 'time2'), drop=False, inplace=True)
return data
    except Exception:
        # quarantine files that fail to read so a batch run can continue
        print('Problem reading: ' + input_files[0])
        print('Moving to Problem_Files')
        os.rename(input_files[0],
                  os.path.join('Problem_Files', os.path.basename(input_files[0])))
return None
def jsc_batch(directory, LUT_files, searchstring='*.txt', to_csv=None):
# Read in the lookup tables to expand filename metadata
refdata = read_refdata(LUT_files)
# get the list of files that match the search string in the given directory
filelist = file_search(directory, searchstring)
spectIDs = [] # create an empty list to hold the spectrometer IDs
libsIDs = []
timestamps = []
locs = []
for file in filelist:
filesplit = os.path.basename(file).split('_')
spectIDs.append(filesplit[6]) # get the spectrometer IDs for each file in the list
libsIDs.append(filesplit[0])
timestamps.append(filesplit[-1].split('.')[0])
locs.append(filesplit[1])
spectIDs_unique = np.unique(spectIDs) # get the unique spectrometer IDs
libsIDs_unique = np.unique(libsIDs)
# loop through each LIBS ID
alldata = []
for ID in libsIDs_unique:
print('Working on : ' + str(ID))
sublist = filelist[np.in1d(libsIDs, ID)]
locs = []
for file in sublist:
locs.append(os.path.basename(file).split('_')[1])
locs_unique = np.unique(locs)
# loop through each location for that libs ID
for loc in locs_unique:
print(loc)
sub_sublist = sublist[np.in1d(locs, loc)] # get the files for that LIBSID and location
data = JSC(sub_sublist, refdata)
alldata.append(data)
combined = pd.concat(alldata)
if to_csv is not None:
print('Writing combined data to: ' + to_csv)
combined.to_csv(to_csv)
return combined
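An end-to-end sketch of the batch reader; the directory and output paths are hypothetical, and LUT_files is the mapping sketched above.

combined = jsc_batch('/data/jsc_libs', LUT_files,
                     searchstring='*.txt', to_csv='jsc_combined.csv')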
# got this function from stack overflow: http://stackoverflow.com/questions/14984119/python-pandas-remove-duplicate-columns
# it's slow but doesn't crash python like combined.T.drop_duplicates().T does in some cases with very large sets of data
def duplicate_columns(frame):
groups = frame.columns.to_series().groupby(frame.dtypes).groups
dups = []
for t, v in groups.items():
cs = frame[v].columns
vs = frame[v]
lcs = len(cs)
for i in range(lcs):
ia = vs.iloc[:, i].values
for j in range(i + 1, lcs):
ja = vs.iloc[:, j].values
if array_equivalent(ia, ja):
dups.append(cs[i])
break
return dups
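A small sketch of duplicate_columns; it returns the first name of each duplicated pair, so dropping the result keeps one copy of each column.

import pandas as pd

frame = pd.DataFrame({'a': [1, 2], 'b': [1, 2], 'c': [3, 4]})
dups = duplicate_columns(frame)   # ['a']: columns 'a' and 'b' hold identical values
frame = frame.drop(dups, axis=1)  # 'b' and 'c' remain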
import os

import numpy as np
from osgeo import gdal

def openm3(input_data):
    root, ext = os.path.splitext(input_data)
    if ext == '.hdr':
        # GDAL wants the img, but many users aim at the .hdr
        input_data = root + '.img'
    ds = gdal.Open(input_data)
ref_array = ds.GetRasterBand(1).ReadAsArray()
metadata = ds.GetMetadata()
wv_array = metadatatoband(metadata)
return wv_array, ref_array, ds
def metadatatoband(metadata):
    wv2band = []
    for k, v in metadata.items():
        try:
            wv2band.append(float(v))
        except ValueError:
            # non-numeric entries embed the wavelength in parentheses; extract it
            v = v.split(" ")[-1].split("(")[1].split(")")[0]
            wv2band.append(float(v))
    wv2band.sort()
    return np.asarray(wv2band)
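A hypothetical sketch of reading an M3 observation; the file name is illustrative.

wavelengths, band1, ds = openm3('m3_target.img')
print(wavelengths[:5])  # band-center wavelengths recovered from the metadata
print(band1.shape)      # pixel array for the first raster band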
import numpy as np
from osgeo import gdal
def openmi(input_data):
ds = gdal.Open(input_data)
band_pointers = []
nbands = ds.RasterCount
    for b in range(1, nbands + 1):
band_pointers.append(ds.GetRasterBand(b))
ref_array = ds.GetRasterBand(1).ReadAsArray()
wv_array = None
return wv_array, ref_array[::3, ::3], ds
def getspectra(x, y, ds):
nbands = ds.RasterCount
reflectance = np.empty(nbands)
for b in range(1, nbands + 1):
reflectance[b - 1] = ds.GetRasterBand(b).ReadAsArray(y, x, 1, 1)
    # bands 5 and 6 are averaged into one, so the merged spectrum has nbands - 1 values
    mergedref = np.empty(nbands - 1)
mergedref[:4] = reflectance[:4]
mergedref[4] = (reflectance[4] + reflectance[5]) / 2
mergedref[5:] = reflectance[6:]
return mergedref
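A hypothetical sketch for the MI reader; the file name is illustrative.

wv, ref, ds = openmi('mi_cube.img')
spectrum = getspectra(100, 200, ds)  # merged reflectance for one pixel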
-try:
-    import yaml
-except:
-    print('YAML package not installed, disabling yaml_io module')
+import yaml
def read_yaml(inputfile):
@@ -21,6 +18,6 @@ def read_yaml(inputfile):
     try:
         with open(inputfile, 'r') as f:
             ydict = yaml.load(f)
-    except: # pragma: no cover
+    except:  # pragma: no cover
         raise IOError('Unable to load YAML file.')
     return ydict
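One-line usage of read_yaml, assuming a YAML file exists at the given (hypothetical) path.

settings = read_yaml('config.yaml')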
import os
import sys
import unittest
sys.path.insert(0, os.path.abspath('..'))
from plio.examples import get_path
from plio.io import io_ccam_pds
class Test_CCAM_IO(unittest.TestCase):
def setUp(self):
self.examplefile = get_path('CL5_398645626CCS_F0030004CCAM02013P3.csv')
def test_14_item_header_csv(self):
io_ccam_pds.CCAM_CSV(self.examplefile)
if __name__ == '__main__':
unittest.main()
@@ -6,8 +6,8 @@ from time import strftime, gmtime
 import pandas as pd
 import pvl

-from .. import io_controlnetwork
-from .. import ControlNetFileV0002_pb2 as cnf
+from plio.io import io_controlnetwork
+from plio.io import ControlNetFileV0002_pb2 as cnf
 from plio.utils.utils import find_in_dict

 sys.path.insert(0, os.path.abspath('..'))
import unittest
from plio.examples import get_path
from plio.io import io_edr
# class Test_Tes_IO(unittest.TestCase):
#
# # Need different test data or need to modify the current code
# def setUp(self):
# self.examplefile = get_path('cl5_398736801edr_f0030004ccam01014m1.dat')
# #
# def test_open(self):
# ds = io_edr.EDR(self.examplefile)
#
# if __name__ == '__main__':
# unittest.main()
 import unittest

-from .. import io_json
-from .. import io_yaml
+from plio.io import io_json
+from plio.io import io_yaml

 try:
     import yaml
@@ -5,6 +5,7 @@ import os
 import fnmatch
 import shutil
 import tempfile
+import pandas as pd

 def create_dir(basedir=''):
@@ -50,30 +51,6 @@ def file_to_list(file):
     return list(file_list)

-def create_dir(basedir=''):
-    """
-    Create a unique, temporary directory in /tmp where processing will occur
-
-    Parameters
-    ----------
-    basedir : str
-        The PATH to create the temporary directory in.
-    """
-    return tempfile.mkdtemp(dir=basedir)
-
-def delete_dir(dir):
-    """
-    Delete a directory
-
-    Parameters
-    ----------
-    dir : str
-        Remove a directory
-    """
-    shutil.rmtree(dir)
-
 def file_search(searchdir, searchstring):
     """
     Recursively search for files in the specified directory
@@ -164,3 +141,31 @@ def xstr(s):
     if s is None:
         return ''
     return str(s)
+
+def lookup(df, lookupfile=None, lookupdf=None, sep=',', skiprows=1, left_on='sclock', right_on='Spacecraft Clock'):
+    # TODO: automatically determine the number of rows to skip to handle both the
+    # ccam internal master list and the PDS "official" master list formats
+    if lookupfile is not None:
+        # concatenate multiple lookup files if provided
+        # (mostly to handle the three different master lists for chemcam)
+        for x in lookupfile:
+            try:
+                tmp = pd.read_csv(x, sep=sep, skiprows=skiprows, error_bad_lines=False)
+                lookupdf = pd.concat([lookupdf, tmp])
+            except:
+                lookupdf = pd.read_csv(x, sep=sep, skiprows=skiprows, error_bad_lines=False)
+    metadata = df['meta']
+    metadata = metadata.merge(lookupdf, left_on=left_on, right_on=right_on, how='left')
+
+    # remove metadata columns that already exist in the data frame to avoid non-unique columns
+    meta_cols = set(metadata.columns.values)
+    meta_cols_keep = list(meta_cols - set(df['meta'].columns.values))
+    metadata = metadata[meta_cols_keep]
+
+    # make the metadata into a multiindex
+    metadata.columns = [['meta'] * len(metadata.columns), metadata.columns.values]
+    # give it the same indices as the df
+    metadata.index = df.index
+
+    # combine the df and the new metadata
+    df = pd.concat([metadata, df], axis=1)
+    return df
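A hedged sketch of lookup applied to a combined ChemCam frame; the master-list file name is hypothetical.

combined = lookup(combined, lookupfile=['chemcam_master_list.csv'],
                  left_on='sclock', right_on='Spacecraft Clock')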