Skip to content
Snippets Groups Projects
Commit 7f0be77f authored by Adam Paquette's avatar Adam Paquette
Browse files

Moved io_jsc from PySAT into plio

parent 3f9640d6
No related branches found
No related tags found
No related merge requests found
import os
import numpy as np
import pandas as pd
from pandas.core.common import array_equivalent
from plio.utils.utils import file_search
# This function reads the lookup tables used to expand metadata from the file names
# This is separated from parsing the filenames so that for large lists of files the
# lookup tables don't need to be read over and over
#
# Info in the tables is stored in a dict of dataframes so that only one variable
# (the dict) needs to be passed between functions
def read_refdata(LUT_files):
ID_info = pd.read_csv(LUT_files['ID'], index_col=0)
spectrometer_info = pd.read_csv(LUT_files['spect'], index_col=0)
# spectrometer_info.reset_index(inplace=True)
laser_info = pd.read_csv(LUT_files['laser'], index_col=0)
# laser_info.reset_index(inplace=True)
exp_info = pd.read_csv(LUT_files['exp'], index_col=0)
# exp_info.reset_index(inplace=True)
sample_info = pd.read_csv(LUT_files['sample'], index_col=0)
# sample_info.reset_index(inplace=True)
refdata = {'spect': spectrometer_info, 'laser': laser_info, 'exp': exp_info, 'sample': sample_info, 'ID': ID_info}
return refdata
# This function parses the file names to record metadata related to the observation
def jsc_filename_parse(filename, refdata):
filename = os.path.basename(filename) # strip the path off of the file name
filename = filename.split('_') # split the file name on underscores
libs_ID = filename[0]
laserID = filename[4][0]
expID = filename[5]
spectID = filename[6]
try:
sampleID = refdata['ID'].loc[libs_ID].values[0]
file_info = pd.DataFrame(refdata['sample'].loc[sampleID])
if file_info.columns.shape[0] < file_info.index.shape[0]:
file_info = file_info.T
if file_info.index.shape[0] > 1:
print('More than one matching row for ' + sampleID + '!')
tempID = 'Unknown'
file_info = pd.DataFrame(refdata['sample'].loc[tempID])
if file_info.columns.shape[0] < file_info.index.shape[0]:
file_info = file_info.T
except:
sampleID = 'Unknown'
file_info = pd.DataFrame(refdata['sample'].loc[sampleID])
if file_info.columns.shape[0] < file_info.index.shape[0]:
file_info = file_info.T
file_info['Sample ID'] = sampleID
file_info['LIBS ID'] = libs_ID
file_info.reset_index(level=0, inplace=True, drop=True)
file_info['loc'] = int(filename[1])
file_info['lab'] = filename[2]
file_info['gas'] = filename[3][0]
file_info['pressure'] = float(filename[3][1:])
if laserID in refdata['laser'].index:
laser_info = pd.DataFrame(refdata['laser'].loc[laserID]).T
laser_info.index.name = 'Laser Identifier'
laser_info.reset_index(level=0, inplace=True)
file_info = pd.concat([file_info, laser_info], axis=1)
file_info['laser_power'] = float(filename[4][1:])
if expID in refdata['exp'].index:
exp_info = pd.DataFrame(refdata['exp'].loc[expID]).T
exp_info.index.name = 'Exp Identifier'
exp_info.reset_index(level=0, inplace=True)
file_info = pd.concat([file_info, exp_info], axis=1)
file_info['spectrometer'] = spectID
if spectID in refdata['spect'].index:
temp = refdata['spect'].loc[spectID]
temp = [temp[2], temp[4:]]
spect_info = pd.DataFrame(refdata['spect'].loc[spectID]).T
spect_info.index.name = 'Spectrometer Identifier'
spect_info.reset_index(level=0, inplace=True)
file_info = pd.concat([file_info, spect_info], axis=1)
return file_info
def JSC(input_files, refdata):
try:
# read the first file
data = pd.read_csv(input_files[0], skiprows=14, sep='\t', engine='c')
data = data.rename(columns={data.columns[0]: 'time1', data.columns[1]: 'time2'})
metadata = pd.concat([jsc_filename_parse(input_files[0], refdata)] * len(data.index))
metadata.drop('spectrometer', axis=1, inplace=True)
# read the next files and merge them with the first
for file in input_files[1:]:
datatemp = pd.read_csv(file, skiprows=14, sep='\t', engine='c')
datatemp = datatemp.rename(columns={datatemp.columns[0]: 'time1', datatemp.columns[1]: 'time2'})
data = data.merge(datatemp)
time = data[['time1', 'time2']] # split the two time columns from the data frame
data.drop(['time1', 'time2'], axis=1, inplace=True) # trim the data frame so it is just the spectra
# make a multiindex for each wavlength column so they can be easily isolated from metadata later
data.columns = [['wvl'] * len(data.columns), np.array(data.columns.values, dtype='float').round(4)]
metadata.index = data.index
metadata = pd.concat([metadata, time], axis=1)
compcols = ['SiO2', 'TiO2', 'Al2O3', 'Cr2O3', 'Fe2O3T', 'MnO', 'MgO', 'CaO', 'Na2O', 'K2O', 'P2O5',
'SO3 LOI Residue', 'Total', 'Total Includes', '%LOI', 'FeO',
'Fe2O3', 'SO3 Actual', 'Fe(3+)/Fe(Total)', 'Rb (ug/g)', 'Sr (ug/g)', 'Y (ug/g)', 'Zr (ug/g)',
'V (ug/g)', 'Ni (ug/g)', 'Cr (ug/g)',
'Nb (ug/g)', 'Ga (ug/g)', 'Cu (ug/g)', 'Zn (ug/g)', 'Co (ug/g)', 'Ba (ug/g)', 'La (ug/g)',
'Ce (ug/g)', 'U (ug/g)', 'Th (ug/g)', 'Sc (ug/g)',
'Pb (ug/g)', 'Ge (ug/g)', 'As (ug/g)', 'Cl (ug/g)']
compdata = metadata[compcols]
metadata.drop(compcols, axis=1, inplace=True)
metadata.columns = [['meta'] * len(metadata.columns), metadata.columns.values]
compdata.columns = [['comp'] * len(compdata.columns), compdata.columns.values]
data = pd.concat([data, metadata, compdata], axis=1)
data[('meta', 'Scan #')] = data.index
data.set_index(('meta', 'time2'), drop=False, inplace=True)
return data
except:
print('Problem reading:' + input_file)
print('Moving to Problem_Files')
os.rename(input_file,
r"Problem_Files\\" + os.path.basename(
input_file))
return None
def jsc_batch(directory, LUT_files, searchstring='*.txt', to_csv=None):
# Read in the lookup tables to expand filename metadata
refdata = read_refdata(LUT_files)
# get the list of files that match the search string in the given directory
filelist = file_search(directory, searchstring)
spectIDs = [] # create an empty list to hold the spectrometer IDs
libsIDs = []
timestamps = []
locs = []
for file in filelist:
filesplit = os.path.basename(file).split('_')
spectIDs.append(filesplit[6]) # get the spectrometer IDs for each file in the list
libsIDs.append(filesplit[0])
timestamps.append(filesplit[-1].split('.')[0])
locs.append(filesplit[1])
spectIDs_unique = np.unique(spectIDs) # get the unique spectrometer IDs
libsIDs_unique = np.unique(libsIDs)
dfs = [] # create an empty list to hold the data frames for each spectrometer
# loop through each LIBS ID
alldata = []
for ID in libsIDs_unique:
print('Working on : ' + str(ID))
sublist = filelist[np.in1d(libsIDs, ID)]
locs = []
for file in sublist:
locs.append(os.path.basename(file).split('_')[1])
locs_unique = np.unique(locs)
# loop through each location for that libs ID
for loc in locs_unique:
print(loc)
sub_sublist = sublist[np.in1d(locs, loc)] # get the files for that LIBSID and location
data = JSC(sub_sublist, refdata)
alldata.append(data)
pass
combined = pd.concat(alldata)
if to_csv is not None:
print('Writing combined data to: ' + to_csv)
combined.to_csv(to_csv)
return combined
# got this function from stack overflow: http://stackoverflow.com/questions/14984119/python-pandas-remove-duplicate-columns
# it's slow but doesn't crash python like combined.T.drop_duplicates().T does in some cases with very large sets of data
def duplicate_columns(frame):
groups = frame.columns.to_series().groupby(frame.dtypes).groups
dups = []
for t, v in groups.items():
cs = frame[v].columns
vs = frame[v]
lcs = len(cs)
for i in range(lcs):
ia = vs.iloc[:, i].values
for j in range(i + 1, lcs):
ja = vs.iloc[:, j].values
if array_equivalent(ia, ja):
dups.append(cs[i])
break
return dups
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment