# Readers for SOCET SET image point files (IPF), taken from the patch
# "Initial addition of ipf reader." against plio/io/io_bae.py.
#
# NOTE(review): the same patch also touches socetset_keywords_to_json,
# read_gpf (column names derived from the file header instead of a hard-coded
# list) and save_gpf (trailing newline). Only fragments of those functions
# are visible in the patch hunks, so they are not reproduced here.
import numpy as np
import pandas as pd

# Number of whitespace-separated values that make up one point record; the
# flat token stream of a file is folded into rows of this width.
_VALUES_PER_POINT = 12


def _expand_header_columns(header_names):
    """
    Expand 'name(n)' header entries into n numbered column names

    Parameters
    ----------
    header_names : iterable
                   of str, the comma separated entries of the header line

    Returns
    -------
    columns : list
              of str; an entry such as 'sig(3)' becomes 'sig0', 'sig1',
              'sig2', any other entry is passed through unchanged
    """
    columns = []
    for entry in header_names:
        entry = str(entry)
        if '(' in entry and ')' in entry:
            base, _, suffix = entry.partition('(')
            repeat = int(suffix.split(')')[0])
            columns.extend('{}{}'.format(base, i) for i in range(repeat))
        else:
            columns.append(entry)
    return columns


def _soft_to_numeric(series):
    """
    Convert a column to a numeric dtype, returning it unchanged on failure

    Parameters
    ----------
    series : pd.Series
             column to convert

    Returns
    -------
     : pd.Series
       the numeric column, or the input when any value cannot be parsed
    """
    # Explicit replacement for pd.to_numeric(..., errors='ignore'), which
    # pandas has deprecated; the result is the same.
    try:
        return pd.to_numeric(series)
    except (ValueError, TypeError):
        return series


def read_ipfs(input_data_list):
    """
    Read a set of socet ipf files into a single pandas data frame

    Parameters
    ----------
    input_data_list : list
                      list of paths to the input data files; the column
                      names are taken from the header of the first file

    Returns
    -------
    df : pd.DataFrame
         containing the points of all files, in input order, with column
         names derived from the header
    """
    # The header is the third line of the file. atleast_1d keeps a
    # single-entry header iterable (genfromtxt returns a 0-d array for it).
    header = np.atleast_1d(np.genfromtxt(input_data_list[0], skip_header=2,
                                         dtype=str, max_rows=1,
                                         delimiter=','))
    columns = _expand_header_columns(header)

    points = [point
              for input_file in input_data_list
              for point in read_ipf(input_file)]

    df = pd.DataFrame(points, columns=columns)

    # Soft conversion of numeric types to numerics, allows str in first col
    # for point_id
    return df.apply(_soft_to_numeric)


def read_ipf(input_data):
    """
    Read the point records of a socet ipf file into a numpy array

    Parameters
    ----------
    input_data : str
                 path to an input data file

    Returns
    -------
    d : ndarray
        (n, 12) array of str, one row per point, holding the raw
        whitespace-separated values of the file

    Raises
    ------
    ValueError
        If the file is too short to hold a header, the point count is not
        an integer, or the values do not fold into rows of 12
    AssertionError
        If the number of points read differs from the count in the header
    """
    with open(input_data, 'r') as f:
        lines = f.readlines()

    # Line 2 holds the expected point count, line 3 the column header.
    if len(lines) < 3:
        raise ValueError('{} does not contain a complete ipf header.'.format(input_data))
    cnt = int(lines[1])

    # A record spans several lines of differing length (plus blank
    # separators), so flatten to one token stream before folding into rows.
    tokens = [token for line in lines[3:] for token in line.split()]
    d = np.array(tokens, dtype=str).reshape(-1, _VALUES_PER_POINT)

    # Validate the read data with the header point count. Raised explicitly
    # so the check also runs under python -O.
    if cnt != len(d):
        raise AssertionError('Dataframe length {} does not match point length {}.'.format(len(d), cnt))

    return d