Commit b95a55b3 authored by jlaura, committed by GitHub

Merge pull request #17 from acpaquette/pysat_io

PySAT io to PL io
Parents: a6b9d17b, 8d4d0077
@@ -7,7 +7,7 @@ branches:
 os:
   - linux
   - osx

 env:
-  - PYTHON_VERSION=3.5
+  - PYTHON_VERSION=3.6
@@ -30,24 +30,24 @@ before_install:
   # Create the env
   - conda create -q -n test python=$PYTHON_VERSION
   - source activate test

 install:
   - conda config --add channels conda-forge
   - conda config --add channels jlaura
-  - conda install -c conda-forge gdal h5py
-  - conda install pandas sqlalchemy pyyaml networkx affine protobuf
+  - conda install -c conda-forge gdal h5py
+  - conda install pandas sqlalchemy pyyaml networkx affine protobuf scipy
   - pip install pvl

   # Development installation
   - conda install pytest pytest-cov sh anaconda-client

 script:
   - pytest --cov=plio

 after_success:
   - coveralls
   # Need to do the build in the root
-  - source deactivate
+  - source deactivate
   - conda install conda-build anaconda-client
   - conda config --set anaconda_upload yes
   - conda build --token $CONDA_UPLOAD_TOKEN --python $PYTHON_VERSION recipe
@@ -54,7 +54,7 @@ install:
 - cmd: conda config --add channels conda-forge
 - cmd: conda config --add channels jlaura
 - cmd: conda install --yes -c conda-forge gdal h5py
-- cmd: conda install --yes pandas sqlalchemy pyyaml networkx affine
+- cmd: conda install --yes pandas sqlalchemy pyyaml networkx affine scipy
 - cmd: conda install --yes -c jlaura protobuf pvl

 # Development installation
@@ -3,7 +3,7 @@ import plio
 __all__ = ['available', 'get_path']

-#Used largely unmodififed from:
+# Used largely unmodified from:
 # https://github.com/pysal/pysal/blob/master/pysal/examples/__init__.py

 base = os.path.split(plio.__file__)[0]
# This code is used to read individual ChemCam files
# Header data is stored as attributes of the data frame
# White space is stripped from the column names
import os
import numpy as np
import pandas as pd
import scipy.io as io
from plio.utils.utils import lookup
from plio.utils.utils import file_search
def CCAM_CSV(input_data, ave=True):
    # ChemCam CCS csv files nominally have a 14-line header, but some carry one
    # extra header row (temperature) or two (temperature and target name),
    # so try each header size in turn and re-raise if none of them work
    for nrows in (14, 15, 16):
        try:
            df = pd.read_csv(input_data, header=nrows, engine='c')
            # strip whitespace from the column names
            df.columns = [i.strip().replace('# ', '') for i in df.columns.values]
            df.set_index(['wave'], inplace=True)  # use wavelengths as indices
            # read the file header so its values can be added to the data frame as new columns
            metadata = pd.read_csv(input_data, sep='=', nrows=nrows, comment=',',
                                   engine='c', index_col=0, header=None)
            break
        except Exception:
            if nrows == 16:
                raise
if ave:
df = pd.DataFrame(df['mean'])
else:
df = df.drop(['mean', 'median'], axis=1)
df.index = [['wvl'] * len(df.index),
df.index.values.round(4)] # create multiindex so spectra can be easily extracted with a single key
df = df.T # transpose so that each spectrum is a row
    # remove extraneous characters from the metadata indices
metadata.index = [i.strip().strip('# ').replace(' FLOAT', '').lower() for i in metadata.index.values]
metadata = metadata.T
# extract info from the file name
fname = os.path.basename(input_data)
metadata['sclock'] = fname[4:13]
metadata['seqid'] = fname[25:34].upper()
metadata['Pversion'] = fname[34:36]
# duplicate the metadata for each row in the df
if not ave:
metadata = metadata.append([metadata] * (len(df.index) - 1), ignore_index=True)
metadata.index = df.index # make the indices match
metadata.columns = [['meta'] * len(metadata.columns), metadata.columns.values] # make the columns into multiindex
df = pd.concat([metadata, df], axis=1) # combine the spectra with the metadata
return df
def CCAM_SAV(input_data, ave=True):
# read the IDL .SAV file
data = io.readsav(input_data, python_dict=True)
# put the spectra into data frames and combine them
df_UV = pd.DataFrame(data['uv'], index=data['defuv'])
df_VIS = pd.DataFrame(data['vis'], index=data['defvis'])
df_VNIR = pd.DataFrame(data['vnir'], index=data['defvnir'])
df_spect = pd.concat([df_UV, df_VIS, df_VNIR])
df_spect.columns = ['shot' + str(i + 1) for i in
df_spect.columns] # add 1 to the columns so they correspond to shot number
df_aUV = pd.DataFrame(data['auv'], index=data['defuv'], columns=['average'])
df_aVIS = pd.DataFrame(data['avis'], index=data['defvis'], columns=['average'])
df_aVNIR = pd.DataFrame(data['avnir'], index=data['defvnir'], columns=['average'])
df_ave = pd.concat([df_aUV, df_aVIS, df_aVNIR])
df_mUV = pd.DataFrame(data['muv'], index=data['defuv'], columns=['median'])
df_mVIS = pd.DataFrame(data['mvis'], index=data['defvis'], columns=['median'])
df_mVNIR = pd.DataFrame(data['mvnir'], index=data['defvnir'], columns=['median'])
df_med = pd.concat([df_mUV, df_mVIS, df_mVNIR])
df = pd.concat([df_spect, df_ave, df_med], axis=1)
# create multiindex to access wavelength values
    # also, round the wavelength values to a more reasonable level of precision
df.index = [['wvl'] * len(df.index), df.index.values.round(4)]
# transpose so that spectra are rows rather than columns
df = df.T
# extract metadata from the file name and add it to the data frame
# use the multiindex label "meta" for all metadata
fname = os.path.basename(input_data)
    # some ChemCam files store the dark spectrum name under the 'darkname' key,
    # others under 'darkspec'; normalize to 'darkname'
    if 'darkname' not in data:
        data['darkname'] = data['darkspec']
metadata = [fname,
fname[4:13],
fname[25:34].upper(),
fname[34:36],
data['continuumvismin'],
data['continuumvnirmin'],
data['continuumuvmin'],
data['continuumvnirend'],
data['distt'],
data['darkname'],
data['nshots'],
data['dnoiseiter'],
data['dnoisesig'],
data['matchedfilter']]
metadata = np.tile(metadata, (len(df.index), 1))
metadata_cols = list(zip(['meta'] * len(df.index), ['file',
'sclock',
'seqid',
'Pversion',
'continuumvismin',
'continuumvnirmin',
'continuumuvmin',
'continuumvnirend',
'distt',
'dark',
'nshots',
'dnoiseiter',
'dnoisesig',
'matchedfilter']))
metadata = pd.DataFrame(metadata, columns=pd.MultiIndex.from_tuples(metadata_cols), index=df.index)
df = pd.concat([metadata, df], axis=1)
    if ave:
        df = df.loc['average']
        df = df.to_frame().T
return df
def ccam_batch(directory, searchstring='*.csv', to_csv=None, lookupfile=None, ave=True, progressbar=None):
    # Determine whether the files are .csv or .SAV
    is_sav = '.sav' in searchstring.lower()
filelist = file_search(directory, searchstring)
basenames = np.zeros_like(filelist)
sclocks = np.zeros_like(filelist)
P_version = np.zeros_like(filelist, dtype='int')
# Extract the sclock and version for each file and ensure that only one
# file per sclock is being read, and that it is the one with the highest version number
for i, name in enumerate(filelist):
basenames[i] = os.path.basename(name)
sclocks[i] = basenames[i][4:13] # extract the sclock
P_version[i] = basenames[i][-5:-4] # extract the version
sclocks_unique = np.unique(sclocks) # find unique sclocks
filelist_new = np.array([], dtype='str')
for i in sclocks_unique:
match = (sclocks == i) # find all instances with matching sclocks
maxP = P_version[match] == max(P_version[match]) # find the highest version among these files
        filelist_new = np.append(filelist_new, filelist[match][maxP])  # keep only the file with the highest version
filelist = filelist_new
# Should add a progress bar for importing large numbers of files
dt = []
for i, file in enumerate(filelist):
print(file)
if is_sav:
tmp = CCAM_SAV(file, ave=ave)
else:
tmp = CCAM_CSV(file, ave=ave)
if i == 0:
combined = tmp
else:
# This ensures that rounding errors are not causing mismatches in columns
cols1 = list(combined['wvl'].columns)
cols2 = list(tmp['wvl'].columns)
if set(cols1) == set(cols2):
combined = pd.concat([combined, tmp])
else:
print("Wavelengths don't match!")
combined.loc[:, ('meta', 'sclock')] = pd.to_numeric(combined.loc[:, ('meta', 'sclock')])
if lookupfile is not None:
combined = lookup(combined, lookupfile=lookupfile.replace('[','').replace(']','').replace("'",'').replace(' ','').split(','))
if to_csv is not None:
combined.to_csv(to_csv)
return combined
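A minimal usage sketch for the readers above; the directory path is hypothetical, and the single-file name is the example csv shipped with plio's tests.

from plio.io.io_ccam_pds import CCAM_CSV, ccam_batch

# read a single ChemCam CCS csv, keeping only the mean spectrum
df = CCAM_CSV('CL5_398645626CCS_F0030004CCAM02013P3.csv', ave=True)
spectra = df['wvl']    # the 'wvl' level of the column multiindex holds the spectra
metadata = df['meta']  # the 'meta' level holds header and filename metadata

# combine every matching file under a directory (path is hypothetical)
combined = ccam_batch('/path/to/ccs_files', searchstring='*.csv', ave=True)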
import os
import numpy as np
import pandas as pd
def EDR(input_file):
f = open(input_file, 'rb') # read as bytes so python won't complain about the binary part of the file
# read lines of the header until reaching the end of the libs table (collecting other metadata along the way)
end_of_libs_table = False
while end_of_libs_table is False:
        # convert the current line to a string and strip the newline characters
        line = str(f.readline(), 'utf-8').replace('\r', '').replace('\n', '')
line = line.split('=') # split the line on equals sign if present
# look for the name of the value we want, if the current line has it, then set the value
if 'RECORD_BYTES' in line[0]:
rbytes = int(line[1])
if 'LABEL_RECORDS' in line[0]:
lrecs = int(line[1])
if 'SPACECRAFT_CLOCK_START_COUNT' in line[0]:
sclock = int(line[1].replace('"', '').split('.')[0])
if 'SEQUENCE_ID' in line[0]:
seqID = line[1].replace('"', '')
if 'INSTRUMENT_FOCUS_DISTANCE' in line[0]:
focus_dist = int(line[1])
        if 'INSTRUMENT_TEMPERATURE' in line[0]:
            # the temperature values continue over the next three header lines
            instrument_temps = line[1] \
                + str(f.readline(), 'utf-8').replace('\r', '').replace('\n', '') \
                + str(f.readline(), 'utf-8').replace('\r', '').replace('\n', '') \
                + str(f.readline(), 'utf-8').replace('\r', '').replace('\n', '')
            instrument_temps = [float(i) for i in instrument_temps.replace('<degC>', '').replace('(', '').replace(')', '').replace(' ', '').split(',')]
            # the corresponding temperature names continue over the next four header lines
            instrument_temps_name = str(f.readline(), 'utf-8').replace('\r', '').replace('\n', '')
            instrument_temps_name = instrument_temps_name.split('=')[1] \
                + str(f.readline(), 'utf-8').replace('\r', '').replace('\n', '') \
                + str(f.readline(), 'utf-8').replace('\r', '').replace('\n', '') \
                + str(f.readline(), 'utf-8').replace('\r', '').replace('\n', '') \
                + str(f.readline(), 'utf-8').replace('\r', '').replace('\n', '')
            instrument_temps_name = instrument_temps_name.replace(' ', '').replace('(', '').replace(')', '').replace('"', '').split(',')
            f.readline()
        try:
            if 'CCAM_LIBS_DATA_CONTAINER' in line[1]:
                nshots = int(str(f.readline(), 'utf-8').replace('\r', '').replace('\n', '').split('=')[1])
                start_byte = int(str(f.readline(), 'utf-8').replace('\r', '').replace('\n', '').split('=')[1])
            if 'END_OBJECT' in line[0] and 'CCAM_LIBS_TABLE' in line[1]:
                end_of_libs_table = True
        except IndexError:
            # lines without an '=' split to a single element and have no line[1]
            pass
f.close()
header_skip = lrecs * rbytes # calculate the number of header bytes to skip to get to the real data
with open(input_file, "rb") as f:
f.seek(header_skip + start_byte - 1, 0)
spectra = []
        while len(spectra) < nshots:
            spectrum = []
            while len(spectrum) < 6444:  # each spectrum is 6444 two-byte channels
                spectrum.append(int.from_bytes(f.read(2), byteorder='big', signed=False))
            spectra.append(spectrum)
spectra = np.array(spectra, dtype='int')
cols = np.array(list(range(spectra.shape[1]))) + 1
cols = [('channel', i) for i in cols]
inds = np.array(list(range(spectra.shape[0]))) + 1
sp = pd.DataFrame(spectra, columns=pd.MultiIndex.from_tuples(cols), index=inds)
sp[('meta', 'EDR_file')] = os.path.basename(input_file)
sp[('meta', 'Spacecraft_Clock')] = sclock
sp[('meta', 'Shot')] = sp.index
sp[('meta', 'SeqID')] = seqID
sp[('meta', 'Focus_Distance')] = focus_dist
for ind, name in enumerate(instrument_temps_name):
sp[('meta', name + '_temp')] = instrument_temps[ind]
return sp
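A hedged sketch of the EDR reader; the file name below comes from the commented-out test further down and may not be distributed with plio.

from plio.io.io_edr import EDR

sp = EDR('cl5_398736801edr_f0030004ccam01014m1.dat')
print(sp['channel'].shape)  # one row per laser shot, 6444 channels each
print(sp['meta'].head())    # spacecraft clock, SeqID, focus distance, temperatures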
import os
import numpy as np
import pandas as pd
from pandas.core.common import array_equivalent
from plio.utils.utils import file_search
# This function reads the lookup tables used to expand metadata from the file names
# This is separated from parsing the filenames so that for large lists of files the
# lookup tables don't need to be read over and over
#
# Info in the tables is stored in a dict of dataframes so that only one variable
# (the dict) needs to be passed between functions
def read_refdata(LUT_files):
ID_info = pd.read_csv(LUT_files['ID'], index_col=0)
spectrometer_info = pd.read_csv(LUT_files['spect'], index_col=0)
# spectrometer_info.reset_index(inplace=True)
laser_info = pd.read_csv(LUT_files['laser'], index_col=0)
# laser_info.reset_index(inplace=True)
exp_info = pd.read_csv(LUT_files['exp'], index_col=0)
# exp_info.reset_index(inplace=True)
sample_info = pd.read_csv(LUT_files['sample'], index_col=0)
# sample_info.reset_index(inplace=True)
refdata = {'spect': spectrometer_info, 'laser': laser_info, 'exp': exp_info, 'sample': sample_info, 'ID': ID_info}
return refdata
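A sketch of the LUT_files mapping read_refdata expects; the csv file names are hypothetical, but the five keys are the ones the function reads.

LUT_files = {'ID': 'ID_lookup.csv',
             'spect': 'spectrometer_lookup.csv',
             'laser': 'laser_lookup.csv',
             'exp': 'experiment_lookup.csv',
             'sample': 'sample_lookup.csv'}
refdata = read_refdata(LUT_files)  # one data frame per table, keyed by name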
# This function parses the file names to record metadata related to the observation
def jsc_filename_parse(filename, refdata):
filename = os.path.basename(filename) # strip the path off of the file name
filename = filename.split('_') # split the file name on underscores
libs_ID = filename[0]
laserID = filename[4][0]
expID = filename[5]
spectID = filename[6]
try:
sampleID = refdata['ID'].loc[libs_ID].values[0]
file_info = pd.DataFrame(refdata['sample'].loc[sampleID])
if file_info.columns.shape[0] < file_info.index.shape[0]:
file_info = file_info.T
if file_info.index.shape[0] > 1:
print('More than one matching row for ' + sampleID + '!')
tempID = 'Unknown'
file_info = pd.DataFrame(refdata['sample'].loc[tempID])
if file_info.columns.shape[0] < file_info.index.shape[0]:
file_info = file_info.T
    except KeyError:  # the LIBS ID is missing from the lookup table
sampleID = 'Unknown'
file_info = pd.DataFrame(refdata['sample'].loc[sampleID])
if file_info.columns.shape[0] < file_info.index.shape[0]:
file_info = file_info.T
file_info['Sample ID'] = sampleID
file_info['LIBS ID'] = libs_ID
file_info.reset_index(level=0, inplace=True, drop=True)
file_info['loc'] = int(filename[1])
file_info['lab'] = filename[2]
file_info['gas'] = filename[3][0]
file_info['pressure'] = float(filename[3][1:])
if laserID in refdata['laser'].index:
laser_info = pd.DataFrame(refdata['laser'].loc[laserID]).T
laser_info.index.name = 'Laser Identifier'
laser_info.reset_index(level=0, inplace=True)
file_info = pd.concat([file_info, laser_info], axis=1)
file_info['laser_power'] = float(filename[4][1:])
if expID in refdata['exp'].index:
exp_info = pd.DataFrame(refdata['exp'].loc[expID]).T
exp_info.index.name = 'Exp Identifier'
exp_info.reset_index(level=0, inplace=True)
file_info = pd.concat([file_info, exp_info], axis=1)
file_info['spectrometer'] = spectID
    if spectID in refdata['spect'].index:
spect_info = pd.DataFrame(refdata['spect'].loc[spectID]).T
spect_info.index.name = 'Spectrometer Identifier'
spect_info.reset_index(level=0, inplace=True)
file_info = pd.concat([file_info, spect_info], axis=1)
return file_info
def JSC(input_files, refdata):
try:
# read the first file
data = pd.read_csv(input_files[0], skiprows=14, sep='\t', engine='c')
data = data.rename(columns={data.columns[0]: 'time1', data.columns[1]: 'time2'})
metadata = pd.concat([jsc_filename_parse(input_files[0], refdata)] * len(data.index))
metadata.drop('spectrometer', axis=1, inplace=True)
# read the next files and merge them with the first
for file in input_files[1:]:
datatemp = pd.read_csv(file, skiprows=14, sep='\t', engine='c')
datatemp = datatemp.rename(columns={datatemp.columns[0]: 'time1', datatemp.columns[1]: 'time2'})
data = data.merge(datatemp)
time = data[['time1', 'time2']] # split the two time columns from the data frame
data.drop(['time1', 'time2'], axis=1, inplace=True) # trim the data frame so it is just the spectra
        # make a multiindex for each wavelength column so they can be easily isolated from metadata later
data.columns = [['wvl'] * len(data.columns), np.array(data.columns.values, dtype='float').round(4)]
metadata.index = data.index
metadata = pd.concat([metadata, time], axis=1)
compcols = ['SiO2', 'TiO2', 'Al2O3', 'Cr2O3', 'Fe2O3T', 'MnO', 'MgO', 'CaO', 'Na2O', 'K2O', 'P2O5',
'SO3 LOI Residue', 'Total', 'Total Includes', '%LOI', 'FeO',
'Fe2O3', 'SO3 Actual', 'Fe(3+)/Fe(Total)', 'Rb (ug/g)', 'Sr (ug/g)', 'Y (ug/g)', 'Zr (ug/g)',
'V (ug/g)', 'Ni (ug/g)', 'Cr (ug/g)',
'Nb (ug/g)', 'Ga (ug/g)', 'Cu (ug/g)', 'Zn (ug/g)', 'Co (ug/g)', 'Ba (ug/g)', 'La (ug/g)',
'Ce (ug/g)', 'U (ug/g)', 'Th (ug/g)', 'Sc (ug/g)',
'Pb (ug/g)', 'Ge (ug/g)', 'As (ug/g)', 'Cl (ug/g)']
compdata = metadata[compcols]
metadata.drop(compcols, axis=1, inplace=True)
metadata.columns = [['meta'] * len(metadata.columns), metadata.columns.values]
compdata.columns = [['comp'] * len(compdata.columns), compdata.columns.values]
data = pd.concat([data, metadata, compdata], axis=1)
data[('meta', 'Scan #')] = data.index
data.set_index(('meta', 'time2'), drop=False, inplace=True)
return data
    except Exception:
        # quarantine files that fail to read so a batch run can continue
        print('Problem reading: ' + input_files[0])
        print('Moving to Problem_Files')
        os.rename(input_files[0],
                  os.path.join('Problem_Files', os.path.basename(input_files[0])))
return None
def jsc_batch(directory, LUT_files, searchstring='*.txt', to_csv=None):
# Read in the lookup tables to expand filename metadata
refdata = read_refdata(LUT_files)
# get the list of files that match the search string in the given directory
filelist = file_search(directory, searchstring)
spectIDs = [] # create an empty list to hold the spectrometer IDs
libsIDs = []
timestamps = []
locs = []
for file in filelist:
filesplit = os.path.basename(file).split('_')
spectIDs.append(filesplit[6]) # get the spectrometer IDs for each file in the list
libsIDs.append(filesplit[0])
timestamps.append(filesplit[-1].split('.')[0])
locs.append(filesplit[1])
spectIDs_unique = np.unique(spectIDs) # get the unique spectrometer IDs
libsIDs_unique = np.unique(libsIDs)
# loop through each LIBS ID
alldata = []
for ID in libsIDs_unique:
print('Working on : ' + str(ID))
sublist = filelist[np.in1d(libsIDs, ID)]
locs = []
for file in sublist:
locs.append(os.path.basename(file).split('_')[1])
locs_unique = np.unique(locs)
# loop through each location for that libs ID
for loc in locs_unique:
print(loc)
sub_sublist = sublist[np.in1d(locs, loc)] # get the files for that LIBSID and location
data = JSC(sub_sublist, refdata)
alldata.append(data)
combined = pd.concat(alldata)
if to_csv is not None:
print('Writing combined data to: ' + to_csv)
combined.to_csv(to_csv)
return combined
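An end-to-end sketch of the batch reader; the directory and output paths are hypothetical, and LUT_files is the mapping sketched above.

combined = jsc_batch('/data/jsc_libs', LUT_files,
                     searchstring='*.txt', to_csv='jsc_combined.csv')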
# got this function from stack overflow: http://stackoverflow.com/questions/14984119/python-pandas-remove-duplicate-columns
# it's slow but doesn't crash python like combined.T.drop_duplicates().T does in some cases with very large sets of data
def duplicate_columns(frame):
groups = frame.columns.to_series().groupby(frame.dtypes).groups
dups = []
for t, v in groups.items():
cs = frame[v].columns
vs = frame[v]
lcs = len(cs)
for i in range(lcs):
ia = vs.iloc[:, i].values
for j in range(i + 1, lcs):
ja = vs.iloc[:, j].values
if array_equivalent(ia, ja):
dups.append(cs[i])
break
return dups
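A small sketch of duplicate_columns; it returns the first name of each duplicated pair, so dropping the result keeps one copy of each column.

import pandas as pd

frame = pd.DataFrame({'a': [1, 2], 'b': [1, 2], 'c': [3, 4]})
dups = duplicate_columns(frame)   # ['a']: columns 'a' and 'b' hold identical values
frame = frame.drop(dups, axis=1)  # 'b' and 'c' remain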
import os

import numpy as np
from osgeo import gdal

def openm3(input_data):
    root, ext = os.path.splitext(input_data)
    if ext == '.hdr':
        # GDAL wants the img, but many users aim at the .hdr
        input_data = root + '.img'
    ds = gdal.Open(input_data)
ref_array = ds.GetRasterBand(1).ReadAsArray()
metadata = ds.GetMetadata()
wv_array = metadatatoband(metadata)
return wv_array, ref_array, ds
def metadatatoband(metadata):
    wv2band = []
    for k, v in metadata.items():
        try:
            wv2band.append(float(v))
        except ValueError:
            # non-numeric entries embed the wavelength in parentheses; extract it
            v = v.split(" ")[-1].split("(")[1].split(")")[0]
            wv2band.append(float(v))
    wv2band.sort()
    return np.asarray(wv2band)
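A hypothetical sketch of reading an M3 observation; the file name is illustrative.

wavelengths, band1, ds = openm3('m3_target.img')
print(wavelengths[:5])  # band-center wavelengths recovered from the metadata
print(band1.shape)      # pixel array for the first raster band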
import numpy as np
from osgeo import gdal
def openmi(input_data):
ds = gdal.Open(input_data)
band_pointers = []
nbands = ds.RasterCount
    for b in range(1, nbands + 1):
band_pointers.append(ds.GetRasterBand(b))
ref_array = ds.GetRasterBand(1).ReadAsArray()
wv_array = None
return wv_array, ref_array[::3, ::3], ds
def getspectra(x, y, ds):
nbands = ds.RasterCount
reflectance = np.empty(nbands)
for b in range(1, nbands + 1):
reflectance[b - 1] = ds.GetRasterBand(b).ReadAsArray(y, x, 1, 1)
    # bands 5 and 6 are averaged into one, so the merged spectrum has nbands - 1 values
    mergedref = np.empty(nbands - 1)
mergedref[:4] = reflectance[:4]
mergedref[4] = (reflectance[4] + reflectance[5]) / 2
mergedref[5:] = reflectance[6:]
return mergedref
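A hypothetical sketch for the MI reader; the file name is illustrative.

wv, ref, ds = openmi('mi_cube.img')
spectrum = getspectra(100, 200, ds)  # merged reflectance for one pixel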
-try:
-    import yaml
-except:
-    print('YAML package not installed, disabling yaml_io module')
+import yaml
def read_yaml(inputfile):
@@ -21,6 +18,6 @@ def read_yaml(inputfile):
     try:
         with open(inputfile, 'r') as f:
             ydict = yaml.load(f)
-    except: # pragma: no cover
+    except:  # pragma: no cover
         raise IOError('Unable to load YAML file.')
     return ydict
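One-line usage of read_yaml, assuming a YAML file exists at the given (hypothetical) path.

settings = read_yaml('config.yaml')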
import os
import sys
import unittest
sys.path.insert(0, os.path.abspath('..'))
from plio.examples import get_path
from plio.io import io_ccam_pds
class Test_CCAM_IO(unittest.TestCase):
def setUp(self):
self.examplefile = get_path('CL5_398645626CCS_F0030004CCAM02013P3.csv')
def test_14_item_header_csv(self):
io_ccam_pds.CCAM_CSV(self.examplefile)
if __name__ == '__main__':
unittest.main()
@@ -6,8 +6,8 @@ from time import strftime, gmtime
 import pandas as pd
 import pvl

-from .. import io_controlnetwork
-from .. import ControlNetFileV0002_pb2 as cnf
+from plio.io import io_controlnetwork
+from plio.io import ControlNetFileV0002_pb2 as cnf
 from plio.utils.utils import find_in_dict

 sys.path.insert(0, os.path.abspath('..'))
import unittest
from plio.examples import get_path
from plio.io import io_edr
# class Test_Tes_IO(unittest.TestCase):
#
# # Need different test data or need to modify the current code
# def setUp(self):
# self.examplefile = get_path('cl5_398736801edr_f0030004ccam01014m1.dat')
# #
# def test_open(self):
# ds = io_edr.EDR(self.examplefile)
#
# if __name__ == '__main__':
# unittest.main()
 import unittest

-from .. import io_json
-from .. import io_yaml
+from plio.io import io_json
+from plio.io import io_yaml

 try:
     import yaml
@@ -5,6 +5,7 @@ import os
 import fnmatch
 import shutil
 import tempfile
+import pandas as pd

 def create_dir(basedir=''):
@@ -50,30 +51,6 @@ def file_to_list(file):
     return list(file_list)

-def create_dir(basedir=''):
-    """
-    Create a unique, temporary directory in /tmp where processing will occur
-
-    Parameters
-    ----------
-    basedir : str
-        The PATH to create the temporary directory in.
-    """
-    return tempfile.mkdtemp(dir=basedir)
-
-def delete_dir(dir):
-    """
-    Delete a directory
-
-    Parameters
-    ----------
-    dir : str
-        Remove a directory
-    """
-    shutil.rmtree(dir)
-
 def file_search(searchdir, searchstring):
     """
     Recursively search for files in the specified directory
@@ -164,3 +141,31 @@ def xstr(s):
     if s is None:
         return ''
     return str(s)
+
+def lookup(df, lookupfile=None, lookupdf=None, sep=',', skiprows=1, left_on='sclock', right_on='Spacecraft Clock'):
+    # TODO: automatically determine the number of rows to skip to handle both the
+    # ccam internal master list and the PDS "official" master list formats
+    if lookupfile is not None:
+        # concatenate multiple lookup files if provided
+        # (mostly to handle the three different master lists for chemcam)
+        for x in lookupfile:
+            try:
+                tmp = pd.read_csv(x, sep=sep, skiprows=skiprows, error_bad_lines=False)
+                lookupdf = pd.concat([lookupdf, tmp])
+            except:
+                lookupdf = pd.read_csv(x, sep=sep, skiprows=skiprows, error_bad_lines=False)
+    metadata = df['meta']
+    metadata = metadata.merge(lookupdf, left_on=left_on, right_on=right_on, how='left')
+
+    # remove metadata columns that already exist in the data frame to avoid non-unique columns
+    meta_cols = set(metadata.columns.values)
+    meta_cols_keep = list(meta_cols - set(df['meta'].columns.values))
+    metadata = metadata[meta_cols_keep]
+
+    # make the metadata into a multiindex
+    metadata.columns = [['meta'] * len(metadata.columns), metadata.columns.values]
+    # give it the same indices as the df
+    metadata.index = df.index
+
+    # combine the df and the new metadata
+    df = pd.concat([metadata, df], axis=1)
+    return df
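A hedged sketch of lookup applied to a combined ChemCam frame; the master-list file name is hypothetical.

combined = lookup(combined, lookupfile=['chemcam_master_list.csv'],
                  left_on='sclock', right_on='Spacecraft Clock')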