Compare revisions

dab489de · dab489de · dab489de · dab489de · dab489de · dab489de
--- a/etl/prs/prepare_inference_input.py
+++ b/etl/prs/prepare_inference_input.py
+import pandas as pd
+import yaml
+import os
+
+df = pd.read_csv(os.path.join('assets', 'data', 'ch3oh_data_top35.csv'))
+
+config_dict = {
+    'simulated_ratio_realizations': 1000,
+    'recompute_kde': True,
+    'probability_threshold': 0.37,
+    'best_bandwidths': {
+        '87-86': 0.1,
+        '88-86': 0.1,
+        '88-87': 0.1,
+        '257-256': 0.4,
+        '381-380': 0.2,
+    },
+    'ratio_limits': {
+        '87-86': [0, 1.7],
+        '88-86': [0, 1.7],
+        '88-87': [0, 1.7],
+        '257-256': [0, 15],
+        '381-380': [0, 4],
+    },
+    'use_model_for_inference': 'PLACEHOLDER',
+    'ratios_to_include': [
+        "87-86",
+        "88-86",
+        "88-87",
+        "257-256",
+        "381-380"
+    ]
+}
+
+intensities_dict = {}
+uncertainties_dict = {}
+
+data_dict = df.to_dict(orient='records')
+for line_id in ['86', '87', '88', '257', '256', '381', '380']:
+    intensities_dict[line_id] = {}
+    uncertainties_dict[line_id] = {}
+    for row in data_dict:
+        intensities_dict[line_id][str(row['source_name'])] = row[f'area_{line_id}']
+        uncertainties_dict[line_id][str(row['source_name'])] = row[f'area_unc_{line_id}']
+config_dict['measured_integrated_intensities'] = intensities_dict
+config_dict['integrated_intensities_uncertainties'] = uncertainties_dict
+
+with open('config/density_inference_input.yml', 'w') as outfile:
+    yaml.dump(config_dict, outfile)
--- a/etl/prs/prs_analytical_representations.py
+++ b/etl/prs/prs_analytical_representations.py
+import numpy as np
+import os
+import matplotlib.pyplot as plt
+import pickle
+from scipy.interpolate import splrep, BSpline, RegularGridInterpolator
+from scipy.optimize import curve_fit
+from assets.commons import (get_data,
+                            load_config_file,
+                            setup_logger,
+                            validate_parameter)
+from typing import Union, List
+
+
+def approx(x, x_shift, a, b, exponent, scale=1):
+    return (np.abs(np.arctanh(2 * (x - x_shift) / scale) + a)) ** exponent + b
+
+
+def main(ratios_to_fit: Union[List[str], None] = None):
+    config = load_config_file(config_file_path=os.path.join('config', 'config.yml'))
+    _ratio_to_fit = validate_parameter(
+        param_to_validate=ratios_to_fit,
+        default=['-'.join(lines_ratio) for lines_ratio in config['overrides']['lines_to_process']])
+    data = get_data(limit_rows=None,
+                    use_model_for_inference=config['run_type'])
+    logger.info(f'Using {config["run_type"]} to fit data...')
+    logger.info(f'Data contains {data.shape[0]} rows')
+
+    bw = {
+        '87-86': 0.1,
+        '88-87': 0.1,
+        '88-86': 0.1,
+        '257-256': 0.4,
+        '381-380': 0.2}
+
+    x0 = {
+        '87-86': ((0.55, 2.3, 3.75, 0.9),),
+        '88-87': ((0.59, 2.3, 3.9, 1.05, 0.96),),
+        '88-86': ((0.55, 2.5, 4.3, 0.83, 1.08),),
+        '257-256': ((5, 2.5, 2.45, 0.75, 8), (5, 2.5, 3.33, 0.75, -8)),
+        '381-380': ((2.04, 2.5, 3.1, 0.6, 2.1), (2.18, 2.5, 3.7, 0.79, -2.4)),
+    }
+    min_ratio = {
+        '87-86': 0.08,
+        '88-87': 0.08,
+        '88-86': 0.03,
+        '257-256': 1,
+        '381-380': 1}
+    max_ratio = {
+        '87-86': 1.04,
+        '88-87': 1.02,
+        '88-86': 1.07,
+        '257-256': 7.5,
+        '381-380': 2.9}
+
+    for ratio_string in _ratio_to_fit:
+        _column_to_fit = f'ratio_{ratio_string}'
+        data.sort_values(by=_column_to_fit, inplace=True)
+        plt.clf()
+        plt.scatter(data[_column_to_fit], data['avg_nh2'], marker='+', alpha=0.1)
+        x_reg = np.linspace(min_ratio[ratio_string], max_ratio[ratio_string], 100)
+
+        data_clean = data[[_column_to_fit, 'avg_nh2']].loc[data[_column_to_fit] <= max_ratio[ratio_string]].copy()
+        kde = pickle.load(
+            open(f'prs/output/run_type/{config["run_type"]}/trained_model/ratio_density_kde_{ratio_string}.pickle',
+                 'rb'))
+        rgi = RegularGridInterpolator((kde['y'] * bw[ratio_string], kde['x'] * 0.2), kde['values'].reshape(200, 200).T)
+        density = rgi(data_clean[[_column_to_fit, 'avg_nh2']])
+        data_clean = data_clean.loc[(density > density.mean() - density.std())]
+        plt.scatter(data_clean[_column_to_fit], data_clean['avg_nh2'], marker='+', alpha=0.1, color='red')
+        for components in x0[ratio_string]:
+            plt.plot(x_reg, approx(x_reg, *components), color='cyan')
+        plt.show()
+
+
+if __name__ == '__main__':
+    logger = setup_logger(name='POLYFIT')
+    main()
--- a/etl/prs/prs_check_biases_poc_sample.py
+++ b/etl/prs/prs_check_biases_poc_sample.py
+from assets.utils import get_poc_results
+from scipy.stats import anderson_ksamp
+from assets.commons import setup_logger
+import numpy as np
+
+
+logger = setup_logger(name='PRS - AD test')
+
+df_full = get_poc_results(line_fit_filename='ch3oh_data_top35.csv')
+
+df_full['best_fit'] /= 1e5
+df_full['best_fit'] = df_full['best_fit'].round(1)
+df_full['mass'] = (df_full['mass'] / 100).round(1)
+df_full['distance'] = (df_full['distance']).round(1)
+df_full['diameter'] = (df_full['diameter']).round(1)
+df_full = df_full[(df_full['mass'] < 100) & (df_full['mass'] > 3)]
+# df_full = df_full[df_full['class'] != 'HII']
+# df_full = df_full[df_full['class'].isin(['IRb', 'IRw'])]
+
+ad_results = {}
+for column in ['mass', 'distance', 'best_fit', 'diameter']:
+    array_list = [d[column].values for _, d in df_full.groupby(['class'])]
+    ad_results[column] = anderson_ksamp(array_list)[2]
+
+logger.info(f'The p values of the AD test are: {ad_results}')
--- a/etl/prs/prs_compute_integrated_fluxes_and_ratios.py
+++ b/etl/prs/prs_compute_integrated_fluxes_and_ratios.py
+import os
+import sqlalchemy
+import numpy as np
+import astropy.units as u
+from datetime import datetime
+from astropy.io import fits
+from astropy import constants
+from typing import Union, List
+from assets.commons import (validate_parameter,
+                            get_value_if_specified,
+                            load_config_file,
+                            setup_logger,
+                            compute_unique_hash_filename)
+from assets.commons.db_utils import upsert, get_pg_engine
+from assets.constants import aggregation_function_mapping
+from stg.stg_build_db_structure import (MomentZeroMaps,
+                                        RatioMaps)
+
+logger = setup_logger(name='PRS')
+
+
+def populate_mom_zero_table(config_prs: dict,
+                            fits_cube_name: str,
+                            moment_zero_map_fits: str,
+                            aggregated_moment_zero: float,
+                            executed_on: datetime.timestamp,
+                            engine: sqlalchemy.engine,
+                            run_id: str):
+    """
+    Insert record in DB for the moment zero table
+    :param config_prs: configuration of the presentation layer
+    :param fits_cube_name: name of the fits cube
+    :param moment_zero_map_fits: filename of the moment zero map
+    :param aggregated_moment_zero: the aggregated value of the moment zero
+    :param executed_on: timestamp of execution
+    :param engine: SQLAlchemy engine to use
+    :param run_id: the run unique identifier
+    """
+    _integration_limit_low = get_value_if_specified(config_prs, 'integration_limits')[0] if get_value_if_specified(
+        config_prs, 'integration_limits') != 'all' else None
+    _integration_limit_high = get_value_if_specified(config_prs, 'integration_limits')[1] if get_value_if_specified(
+        config_prs, 'integration_limits') != 'all' else None
+    moment_zero_dict = {
+        'mom_zero_name': f'{moment_zero_map_fits}',
+        'fits_cube_name': f'{fits_cube_name}',
+        'integration_limit_low': _integration_limit_low,
+        'integration_limit_high': _integration_limit_high,
+        'aggregated_moment_zero': aggregated_moment_zero,
+        'aggregation_function': config_prs['moment_zero_aggregation_function'],
+        'created_on': executed_on,
+        'run_id': run_id,
+    }
+    upsert(
+        table_object=MomentZeroMaps,
+        row_dict=moment_zero_dict,
+        conflict_keys=[MomentZeroMaps.mom_zero_name,
+                       MomentZeroMaps.run_id],
+        engine=engine
+    )
+
+
+def populate_line_ratios_table(config_prs: dict,
+                               moment_zero_map_fits_list: List[str],
+                               ratio_map_name: str,
+                               aggregated_ratio: float,
+                               engine: sqlalchemy.engine,
+                               executed_on: datetime.timestamp,
+                               run_id: str):
+    """
+    Insert record in DB for the line ratios table
+    :param config_prs: configuration of the presentation layer
+    :param moment_zero_map_fits_list: list of the moment 0 maps to use to compute the ratio
+    :param ratio_map_name: filename of the ratio map
+    :param aggregated_ratio: aggregated value of the ratio
+    :param engine: SQLAlchemy engine to use
+    :param executed_on: timestamp of execution
+    :param run_id: the run unique identifier
+    """
+    _integration_limit_low = get_value_if_specified(config_prs, 'integration_limits')[0] if get_value_if_specified(
+        config_prs, 'integration_limits') != 'all' else None
+    _integration_limit_high = get_value_if_specified(config_prs, 'integration_limits')[1] if get_value_if_specified(
+        config_prs, 'integration_limits') != 'all' else None
+    assert len(moment_zero_map_fits_list) == 2
+    line_ratio_dict = {
+        'ratio_map_name': f'{ratio_map_name}',
+        'mom_zero_name_1': f'{moment_zero_map_fits_list[0]}',
+        'mom_zero_name_2': f'{moment_zero_map_fits_list[1]}',
+        'aggregated_ratio': aggregated_ratio,
+        'aggregation_function': config_prs['aggregation_function'],
+        'created_on': executed_on,
+        'run_id': run_id,
+    }
+    upsert(
+        table_object=RatioMaps,
+        row_dict=line_ratio_dict,
+        conflict_keys=[RatioMaps.ratio_map_name,
+                       RatioMaps.run_id],
+        engine=engine
+    )
+
+
+def compute_moment_zero(cube: str,
+                        config: dict,
+                        cube_path: Union[str, None] = None,
+                        moment_zero_path: Union[str, None] = None,
+                        moment_zero_fits_name: Union[str, None] = None,
+                        hdu_idx: int = 0) -> float:
+    """
+    Compute moment zero map, given a cube
+    :param cube: filename of the fits cube
+    :param config: configuration of the presentation layer
+    :param cube_path: path where the fits cube is to be found
+    :param moment_zero_path: path where the moment 0 map is to be saved
+    :param moment_zero_fits_name: filename for the moment zero map
+    :param hdu_idx: index of the HDU to extract the data and the header
+    :return: the aggregated value of the moment zero, according to the function specified in the configuration
+    """
+    _cube_path = validate_parameter(cube_path, default=os.path.join('prs', 'fits', 'cubes'))
+    _moment_zero_path = validate_parameter(moment_zero_path, default=os.path.join('prs', 'fits', 'moments'))
+    _moment_zero_fits_name = validate_parameter(moment_zero_fits_name, default='test_mom0.fits')
+
+    fitsfile = open_fits_file_duck_typing(fitsfile=cube, fits_path=_cube_path)
+    header = fitsfile[hdu_idx].header.copy()
+    data = fitsfile[hdu_idx].data
+    conversion = header['CUNIT3'].strip() == 'HZ'
+    spectral_unit = 'km/s' if conversion is True else 'Hz'
+    conversion_factor = 1 if conversion is False else (-constants.c * header['CDELT3'] / header['CRVAL3']).to('km/s').value
+    mom0 = (data.sum(axis=0) * abs(conversion_factor))
+    keywords_to_delete = ['NAXIS3', 'CRPIX3', 'CDELT3', 'CRVAL3', 'CUNIT3', 'CTYPE3']
+    header['BUNIT'] += f' {spectral_unit}'
+    header['BTYPE'] = 'MOMENT0'
+    header['NAXIS'] = 2
+    for key in keywords_to_delete:
+        del header[key]
+    fits.writeto(os.path.join(_moment_zero_path, _moment_zero_fits_name), data=mom0, header=header, overwrite=True)
+    try:
+        return aggregation_function_mapping[config['moment_zero_aggregation_function']](mom0)
+    except KeyError:
+        logger.warning(
+            'Moment zero aggregation function not set or misconfigured. Trying to use the one for the ratio...')
+        return aggregation_function_mapping[config['aggregation_function']](mom0)
+
+
+def open_fits_file_duck_typing(fitsfile: Union[str, fits.PrimaryHDU],
+                               fits_path: str = None) -> fits.PrimaryHDU:
+    _fits_path = validate_parameter(fits_path, default='.')
+    try:
+        hdu = fits.open(os.path.join(_fits_path, fitsfile))
+    except TypeError:
+        hdu = fitsfile
+    return hdu
+
+
+def compute_image_ratios(fits1: str,
+                         fits2: str,
+                         config: dict,
+                         fits_path: Union[str, None] = None,
+                         ratio_fits_path: Union[str, None] = None,
+                         ratio_fits_name: Union[str, None] = None,
+                         hdu1_idx: int = 0,
+                         hdu2_idx: int = 0):
+    """
+    Compute the ratio of the images and save it to fits. The ratio is computed as fits1/fits2
+    :param fits1: filename of the first moment 0 map
+    :param fits2: filename of the second moment 0 map
+    :param config: configuration of the presentation layer
+    :param fits_path: path where the fits files are to be found
+    :param ratio_fits_path: path where the ratio map is to be saved
+    :param ratio_fits_name: filename for the ratio map
+    :param hdu1_idx: index of the HDU to extract the data and the header, for the first fits file
+    :param hdu2_idx: index of the HDU to extract the data and the header, for the second fits file
+    :return:
+    """
+    _fits_path = validate_parameter(fits_path, default=os.path.join('prs', 'fits', 'moments'))
+    _ratio_fits_path = validate_parameter(ratio_fits_path, default=os.path.join('prs', 'fits', 'ratios'))
+    _ratio_fits_name = validate_parameter(ratio_fits_name, default='ratio.fits')
+    hdu1 = open_fits_file_duck_typing(fitsfile=fits1,
+                                      fits_path=_fits_path)
+    hdu2 = open_fits_file_duck_typing(fitsfile=fits2,
+                                      fits_path=_fits_path)
+    ratio_image_data = hdu1[hdu1_idx].data / hdu2[hdu2_idx].data
+    if config['aggregation_function'] in ('mean', 'sum'):
+        aggregated_ratio = aggregation_function_mapping[config['aggregation_function']](ratio_image_data)
+    elif config['aggregation_function'] == 'weighted_mean':
+        aggregated_ratio = np.nansum(hdu1[hdu1_idx].data) / np.nansum(hdu2[hdu2_idx].data)
+    else:
+        raise NotImplementedError('Aggregation function not configured')
+    output_header = hdu1[hdu1_idx].header
+    output_header['BUNIT'] = ''
+    output_header['BTYPE'] = 'INT.RATIO'
+    fits.writeto(os.path.join(_ratio_fits_path, _ratio_fits_name),
+                 data=ratio_image_data,
+                 header=output_header,
+                 overwrite=True)
+    return aggregated_ratio
+
+
+def main(cube_fits_list: List[str],
+         run_id: str,
+         mom0_out_cube1: Union[str, None] = None,
+         mom0_out_cube2: Union[str, None] = None,
+         engine: sqlalchemy.engine = None) -> str:
+    assert len(cube_fits_list) == 2
+    _mom0_out_cube1 = validate_parameter(mom0_out_cube1, default=cube_fits_list[0].replace('.fits', '_mom0.fits'))
+    _mom0_out_cube2 = validate_parameter(mom0_out_cube2, default=cube_fits_list[1].replace('.fits', '_mom0.fits'))
+    config_prs = load_config_file(os.path.join('prs', 'config', 'config.yml'))['flux_computation']
+    executed_on = datetime.now()
+    if engine is None:
+        engine = get_pg_engine(logger=logger)
+    config_prs.update({
+        'cube_fits_list': cube_fits_list,
+        'mom0_image_list': [_mom0_out_cube1, _mom0_out_cube2]
+    })
+    ratio_fits_name = f'{compute_unique_hash_filename(config=config_prs)}.fits'
+    aggregated_mom0_1 = compute_moment_zero(cube=cube_fits_list[0],
+                                            config=config_prs,
+                                            moment_zero_fits_name=_mom0_out_cube1)
+    aggregated_mom0_2 = compute_moment_zero(cube=cube_fits_list[1],
+                                            config=config_prs,
+                                            moment_zero_fits_name=_mom0_out_cube2)
+    aggregated_image_ratio = compute_image_ratios(fits1=_mom0_out_cube1,
+                                                  fits2=_mom0_out_cube2,
+                                                  config=config_prs,
+                                                  ratio_fits_name=ratio_fits_name)
+    populate_mom_zero_table(config_prs=config_prs,
+                            fits_cube_name=cube_fits_list[0],
+                            moment_zero_map_fits=_mom0_out_cube1,
+                            aggregated_moment_zero=aggregated_mom0_1,
+                            executed_on=executed_on,
+                            engine=engine,
+                            run_id=run_id)
+    populate_mom_zero_table(config_prs=config_prs,
+                            fits_cube_name=cube_fits_list[1],
+                            aggregated_moment_zero=aggregated_mom0_2,
+                            moment_zero_map_fits=_mom0_out_cube2,
+                            executed_on=executed_on,
+                            engine=engine,
+                            run_id=run_id)
+    populate_line_ratios_table(config_prs=config_prs,
+                               moment_zero_map_fits_list=[_mom0_out_cube1, _mom0_out_cube2],
+                               ratio_map_name=ratio_fits_name,
+                               aggregated_ratio=aggregated_image_ratio,
+                               engine=engine,
+                               executed_on=executed_on,
+                               run_id=run_id)
+    return ratio_fits_name
+
+
+if __name__ == '__main__':
+    main(cube_fits_list=['test_cube.fits', 'test_cube.fits'],
+         mom0_out_cube1='test_cube_mom0.fits',
+         mom0_out_cube2='test_cube_mom0_1.fits',
+         run_id='test_run')
--- a/etl/prs/prs_density_inference.py
+++ b/etl/prs/prs_density_inference.py
--- a/etl/prs/prs_expand_modelling_with_ml.py
+++ b/etl/prs/prs_expand_modelling_with_ml.py
--- a/etl/prs/prs_expand_ratio_modelling_with_ml.py
+++ b/etl/prs/prs_expand_ratio_modelling_with_ml.py
--- a/etl/prs/prs_inspect_results.py
+++ b/etl/prs/prs_inspect_results.py
--- a/etl/prs/prs_ml_training.py
+++ b/etl/prs/prs_ml_training.py
--- a/etl/prs/prs_ml_training_ratio.py
+++ b/etl/prs/prs_ml_training_ratio.py
--- a/etl/prs/prs_poc_figures.py
+++ b/etl/prs/prs_poc_figures.py
--- a/etl/prs/prs_poc_latex_table.py
+++ b/etl/prs/prs_poc_latex_table.py
--- a/etl/requirements.txt
+++ b/etl/requirements.txt
+numpy==1.23.4
+PyYAML==6.0
+astropy==5.1
+matplotlib==3.6.2
+scipy==1.9.3
+tqdm==4.64.1
+xarray==2022.10.0
+sqlalchemy==1.4.42
+psycopg2-binary==2.9.5
+seaborn==0.12.2
+scikit-learn==1.3.0
+xgboost==2.0.0
+astropy==5.1
+keras==2.12.0
\ No newline at end of file
--- a/etl/reset_output.sh
+++ b/etl/reset_output.sh
+#!/bin/bash
+
+read -p "This will delete all of the results, are you sure? " -n 1 -r
+echo
+
+if [[ $REPLY =~ ^[Yy]$ ]]
+then
+    folder_list=("prs/fits/cubes" "prs/fits/moments" "prs/fits/ratios" "prs/fits/grids")
+    for folder_name in ${folder_list[@]}
+    do
+        rm $folder_name/*.fits
+    done
+    rm prs/output/*.png
+    rm stg/archive/*.zip
+    rm full_dataset*.csv
+fi
+
--- a/etl/stg/config/config.yml
+++ b/etl/stg/config/config.yml
--- a/etl/stg/stg_build_db_structure.py
+++ b/etl/stg/stg_build_db_structure.py
--- a/etl/stg/stg_radmc_input_generator.py
+++ b/etl/stg/stg_radmc_input_generator.py
--- a/etl/tests/__init__.py
+++ b/etl/tests/__init__.py
--- a/etl/tests/test_commons.py
+++ b/etl/tests/test_commons.py
--- a/etl/tests/test_files/credentials.yaml
+++ b/etl/tests/test_files/credentials.yaml
+DB_USER: cGlwcG8=
+DB_PASS: cGx1dG8=
+DB_HOST: localhost
+DB_NAME: postgres
+DB_PORT: 5432
\ No newline at end of file
No results found