/* Copyright 2024 INAF - Osservatorio Astronomico di Cagliari

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
 */

/*! \file cluster.cpp
 *
 * \brief Implementation of the calculation for a cluster of spheres.
 */
#include <chrono>
#include <cstdio>
#include <exception>
#include <fstream>
#include <string>
#include <nvtx3/nvToolsExt.h>
#ifdef _OPENMP
#include <omp.h>
#endif
#ifdef USE_MPI
#ifndef MPI_VERSION
#include <mpi.h>
#endif
#endif
#ifdef USE_MAGMA
#include <cuda_runtime.h>
#endif

#ifndef INCLUDE_TYPES_H_
#include "../include/types.h"
#endif

#ifndef INCLUDE_ERRORS_H_
#include "../include/errors.h"
#endif

#ifndef INCLUDE_LOGGING_H_
#include "../include/logging.h"
#endif

#ifndef INCLUDE_CONFIGURATION_H_
#include "../include/Configuration.h"
#endif

#ifndef INCLUDE_COMMONS_H_
#include "../include/Commons.h"
#endif

#ifndef INCLUDE_SPH_SUBS_H_
#include "../include/sph_subs.h"
#endif

#ifndef INCLUDE_CLU_SUBS_H_
#include "../include/clu_subs.h"
#endif

#ifndef INCLUDE_TRANSITIONMATRIX_H_
#include "../include/TransitionMatrix.h"
#endif

#ifndef INCLUDE_ALGEBRAIC_H_
#include "../include/algebraic.h"
#endif

using namespace std;

// I would like to put it all in a struct, but then I'd have to write a constructor for it, due to members defined as references, creating a worse nightmare than the one I'd like to simplify...

/*! \brief Run the calculation for a single scale (forward declaration; the definition is further down in this file).
 *
 *  \param jxi488: `int` 1-based index of the scale to compute (the scale value is read with `sconf->get_scale(jxi488 - 1)`).
 *  \param sconf: `ScattererConfiguration *` Scatterer configuration (scales, dielectric constants, `exdc`, `idfc`).
 *  \param gconf: `GeometryConfiguration *` Geometry configuration (number of spheres, `li`, `le`, `mxndm`, ...).
 *  \param sa: `ScatteringAngles *` Incident and scattered angle ranges looped over by the function.
 *  \param cid: `ClusterIterationData *` Working data of the iteration; it is modified by the call.
 *  \param output: `FILE *` Text stream the formatted results are printed to.
 *  \param output_path: `string` Directory where additional files (e.g. `c_TTMS`) are written.
 *  \param tppoan: `fstream &` Binary stream the binary results are written to.
 *  \return `int` 0 on success, otherwise the non-zero error code reported by the failing step.
 */
int cluster_jxi488_cycle(int jxi488, ScattererConfiguration *sconf, GeometryConfiguration *gconf, ScatteringAngles *sa, ClusterIterationData *cid, FILE *output, const string& output_path, fstream& tppoan);

/*! \brief C++ implementation of CLU
 *
 *  \param config_file: `string` Name of the configuration file.
 *  \param data_file: `string` Name of the input data file.
 *  \param output_path: `string` Directory to write the output files in.
 */
void cluster(const string& config_file, const string& data_file, const string& output_path, const mixMPI *mpidata) {
  chrono::time_point<chrono::high_resolution_clock> t_start = chrono::high_resolution_clock::now();
  chrono::duration<double> elapsed;
  string message;
  string timing_name = output_path + "/c_timing_mpi"+ to_string(mpidata->rank) +".log";
  FILE *timing_file = fopen(timing_name.c_str(), "w");
  Logger *time_logger = new Logger(LOG_DEBG, timing_file);
  Logger *logger = new Logger(LOG_DEBG);
#ifdef USE_MAGMA
  int device_count;
  cudaGetDeviceCount(&device_count);
  logger->log("DEBUG: Proc-" + to_string(mpidata->rank) + " found " + to_string(device_count) + " CUDA devices.\n", LOG_DEBG);
  logger->log("INFO: Process " + to_string(mpidata->rank) + " initializes MAGMA.\n");
  magma_int_t magma_result = magma_init();
  if (magma_result != MAGMA_SUCCESS) {
    logger->err("ERROR: Process " + to_string(mpidata->rank) + " failed to initilize MAGMA.\n");
    logger->err("PROC-" + to_string(mpidata->rank) + ": MAGMA error code " + to_string(magma_result) + "\n");
    fclose(timing_file);
    delete time_logger;
    delete logger;
    return;
  }
#endif
  // the following only happens on MPI process 0
  if (mpidata->rank == 0) {
    nvtxRangePush("Set up");
    logger->log("INFO: making legacy configuration...", LOG_INFO);
    ScattererConfiguration *sconf = NULL;
    try {
      sconf = ScattererConfiguration::from_dedfb(config_file);
    } catch(const OpenConfigurationFileException &ex) {
      logger->err("\nERROR: failed to open scatterer configuration file.\n");
      string message = "FILE: " + string(ex.what()) + "\n";
      logger->err(message);
      fclose(timing_file);
      delete time_logger;
      delete logger;
      return;
    }
    sconf->write_formatted(output_path + "/c_OEDFB");
    sconf->write_binary(output_path + "/c_TEDF");
    sconf->write_binary(output_path + "/c_TEDF.hd5", "HDF5");
    GeometryConfiguration *gconf = NULL;
    try {
      gconf = GeometryConfiguration::from_legacy(data_file);
    } catch (const OpenConfigurationFileException &ex) {
      logger->err("\nERROR: failed to open geometry configuration file.\n");
      string message = "FILE: " + string(ex.what()) + "\n";
      logger->err(message);
      if (sconf) delete sconf;
      fclose(timing_file);
      delete time_logger;
      delete logger;
      return;
    }
    logger->log(" done.\n", LOG_INFO);
    nvtxRangePop();
    int s_nsph = sconf->number_of_spheres;
    int nsph = gconf->number_of_spheres;
    if (s_nsph == nsph) {
      // Shortcuts to variables stored in configuration objects
      ScatteringAngles *p_scattering_angles = new ScatteringAngles(gconf);
      double wp = sconf->wp;
      FILE *output = fopen((output_path + "/c_OCLU").c_str(), "w");
      ClusterIterationData *cid = new ClusterIterationData(gconf, sconf, mpidata);
      const int ndi = cid->c4->nsph * cid->c4->nlim;
      np_int ndit = 2 * ndi;
      logger->log("INFO: Size of matrices to invert: " + to_string((int64_t)ndit) + " x " + to_string((int64_t)ndit) +".\n");
      time_logger->log("INFO: Size of matrices to invert: " + to_string((int64_t)ndit) + " x " + to_string((int64_t)ndit) +".\n");
      fprintf(output, " READ(IR,*)NSPH,LI,LE,MXNDM,INPOL,NPNT,NPNTTS,IAVM,ISAM\n");
      fprintf(output, " %5d%5d%5d%5ld%5d%5d%5d%5d%5d\n",
	      nsph, cid->c4->li, cid->c4->le, gconf->mxndm, gconf->in_pol, gconf->npnt,
	      gconf->npntts, gconf->iavm, gconf->iavm
	      );
      fprintf(output, " READ(IR,*)RXX(I),RYY(I),RZZ(I)\n");
      for (int ri = 0; ri < nsph; ri++) fprintf(output, "%17.8lE%17.8lE%17.8lE\n",
						gconf->get_sph_x(ri), gconf->get_sph_y(ri), gconf->get_sph_z(ri)
						);
      fprintf(output, " READ(IR,*)TH,THSTP,THLST,THS,THSSTP,THSLST\n");
      fprintf(
	      output, " %10.3lE%10.3lE%10.3lE%10.3lE%10.3lE%10.3lE\n",
	      p_scattering_angles->th, p_scattering_angles->thstp,
	      p_scattering_angles->thlst, p_scattering_angles->ths,
	      p_scattering_angles->thsstp, p_scattering_angles->thslst
	      );
      fprintf(output, " READ(IR,*)PH,PHSTP,PHLST,PHS,PHSSTP,PHSLST\n");
      fprintf(
	      output, " %10.3lE%10.3lE%10.3lE%10.3lE%10.3lE%10.3lE\n",
	      p_scattering_angles->ph, p_scattering_angles->phstp,
	      p_scattering_angles->phlst, p_scattering_angles->phs,
	      p_scattering_angles->phsstp, p_scattering_angles->phslst
	      );
      fprintf(output, " READ(IR,*)JWTM\n");
      fprintf(output, " %5d\n", gconf->jwtm);
      fprintf(output, "  READ(ITIN)NSPHT\n");
      fprintf(output, "  READ(ITIN)(IOG(I),I=1,NSPH)\n");
      fprintf(output, "  READ(ITIN)EXDC,WP,XIP,IDFC,NXI\n");
      fprintf(output, "  READ(ITIN)(XIV(I),I=1,NXI)\n");
      fprintf(output, "  READ(ITIN)NSHL(I),ROS(I)\n");
      fprintf(output, "  READ(ITIN)(RCF(I,NS),NS=1,NSH)\n");
      fprintf(output, " \n");
      str(sconf, cid->c1, cid->c1ao, cid->c3, cid->c4, cid->c6);
      thdps(cid->c4->lm, cid->zpv);
      double exdc = sconf->exdc;
      double exri = sqrt(exdc);
      fprintf(output, "  REFR. INDEX OF EXTERNAL MEDIUM=%15.7lE\n", exri);
      fstream *tppoanp = new fstream;
      fstream &tppoan = *tppoanp;
      string tppoan_name = output_path + "/c_TPPOAN";
      tppoan.open(tppoan_name.c_str(), ios::out | ios::binary);
      if (tppoan.is_open()) {
#ifdef USE_MAGMA
	logger->log("INFO: using MAGMA calls.\n", LOG_INFO);
#elif defined USE_LAPACK
	logger->log("INFO: using LAPACK calls.\n", LOG_INFO);
#else
	logger->log("INFO: using fall-back lucin() calls.\n", LOG_INFO);
#endif
	int iavm = gconf->iavm;
	int isam = gconf->isam;
	int inpol = gconf->in_pol;
	int nxi = sconf->number_of_scales;
	int nth = p_scattering_angles->nth;
	int nths = p_scattering_angles->nths;
	int nph = p_scattering_angles->nph;
	int nphs = p_scattering_angles->nphs;
	tppoan.write(reinterpret_cast<char *>(&iavm), sizeof(int));
	tppoan.write(reinterpret_cast<char *>(&isam), sizeof(int));
	tppoan.write(reinterpret_cast<char *>(&inpol), sizeof(int));
	tppoan.write(reinterpret_cast<char *>(&nxi), sizeof(int));
	tppoan.write(reinterpret_cast<char *>(&nth), sizeof(int));
	tppoan.write(reinterpret_cast<char *>(&nph), sizeof(int));
	tppoan.write(reinterpret_cast<char *>(&nths), sizeof(int));
	tppoan.write(reinterpret_cast<char *>(&nphs), sizeof(int));
	if (sconf->idfc < 0) {
	  cid->vk = cid->xip * cid->wn;
	  fprintf(output, "  VK=%15.7lE, XI IS SCALE FACTOR FOR LENGTHS\n", cid->vk);
	  fprintf(output, " \n");
	}
	// do the first iteration on jxi488 separately, since it seems to be different from the others
	int jxi488 = 1;
	chrono::time_point<chrono::high_resolution_clock> start_iter_1 = chrono::high_resolution_clock::now();
	nvtxRangePush("First iteration");
	int jer = cluster_jxi488_cycle(jxi488, sconf, gconf, p_scattering_angles, cid, output, output_path, tppoan);
	nvtxRangePop();
	chrono::time_point<chrono::high_resolution_clock> end_iter_1 = chrono::high_resolution_clock::now();
	elapsed = start_iter_1 - t_start;
	string message = "INFO: Calculation setup took " + to_string(elapsed.count()) + "s.\n";
	logger->log(message);
	time_logger->log(message);
	elapsed = end_iter_1 - start_iter_1;
	message = "INFO: First iteration took " + to_string(elapsed.count()) + "s.\n";
	logger->log(message);
	time_logger->log(message);
	if (jer != 0) {
	  // First loop failed. Halt the calculation.
	  tppoan.close();
	  fclose(timing_file);
	  fclose(output);
	  delete p_scattering_angles;
	  delete cid;
	  delete logger;
	  delete time_logger;
	  delete sconf;
	  delete gconf;
	  return;
	}

	// here go the calls that send data to be duplicated on other MPI processes from process 0 to others, using MPI broadcasts, but only if MPI is actually used
#ifdef MPI_VERSION
	if (mpidata->mpirunning) {
	  gconf->mpibcast(mpidata);
	  sconf->mpibcast(mpidata);	    
	  cid->mpibcast(mpidata);
	  p_scattering_angles->mpibcast(mpidata);
	}	
#endif
	// Create this variable and initialise it with a default here, so that it is defined anyway, with or without OpenMP support enabled
	int ompnumthreads = 1;

	nvtxRangePush("Parallel loop");
#pragma omp parallel
	{
	  // Create and initialise this variable here, so that if OpenMP is enabled it is local to the thread, and if OpenMP is not enabled it has a well-defiled value anyway
	  int myompthread = 0;
#ifdef _OPENMP
	  // If OpenMP is enabled, give actual values to myompthread and ompnumthreads, and open thread-local output files
	  myompthread = omp_get_thread_num();
	  if (myompthread == 0) ompnumthreads = omp_get_num_threads();
#endif
	  // To test parallelism, I will now start feeding this function with "clean" copies of the parameters, so that they will not be changed by previous iterations, and each one will behave as the first one. Define all (empty) variables here, so they have the correct scope, then they get different definitions depending on thread number
	  ClusterIterationData *cid_2 = NULL;
	  FILE *output_2 = NULL;
	  fstream *tppoanp_2 = NULL;
	  // for threads other than the 0, create distinct copies of all relevant data, while for thread 0 just define new references / pointers to the original ones
	  if (myompthread == 0) {
	    cid_2 = cid;
	    output_2 = output;
	    tppoanp_2 = tppoanp;
	  } else {
	    // this is not thread 0, so do create fresh copies of all local variables
	    cid_2 = new ClusterIterationData(*cid);
	    output_2 = fopen((output_path + "/c_OCLU_" + to_string(mpidata->rank) + "_" + to_string(myompthread)).c_str(), "w");
	    tppoanp_2 = new fstream;
	    tppoanp_2->open((output_path + "/c_TPPOAN_" + to_string(mpidata->rank) + "_" + to_string(myompthread)).c_str(), ios::out | ios::binary);
	  }
	  fstream &tppoan_2 = *tppoanp_2;
	  // make sure all threads align here: I don't want the following loop to accidentally start for thread 0, possibly modifying some variables before they are copied by all other threads
#pragma omp barrier
	  if (myompthread==0) {
	    logger->log("Syncing OpenMP threads and starting the loop on wavelengths\n");
	  }
	  // ok, now I can actually start the parallel calculations
#pragma omp for
	  for (jxi488 = cid_2->firstxi; jxi488 <= cid_2->lastxi; jxi488++) {
	    int jer = cluster_jxi488_cycle(jxi488, sconf, gconf, p_scattering_angles, cid_2, output_2, output_path, *tppoanp_2);
	  }

#pragma omp barrier
	  // only threads different from 0 have to free local copies of variables and close local files
	  if (myompthread != 0) {
	    delete cid_2;
	    fclose(output_2);
	    tppoanp_2->close();
	    delete tppoanp_2;
	  }
#pragma omp barrier
	  {
	    string message = "INFO: Closing thread-local output files of thread " + to_string(myompthread) + " and syncing threads.\n";
	    logger->log(message);
	  }
	} // closes pragma omp parallel
	nvtxRangePop();

	nvtxRangePush("Output concatenation");
#ifdef _OPENMP
#pragma omp barrier
	{
	  // thread 0 already wrote on global files, skip it and take care of appending the others
	  for (int ri = 1; ri < ompnumthreads; ri++) {
	    string partial_file_name = output_path + "/c_OCLU_" + to_string(mpidata->rank) + "_" + to_string(ri);
	    string message = "Copying ASCII output in MPI process " + to_string(mpidata->rank) + " of thread " + to_string(ri) + " of " + to_string(ompnumthreads - 1) + "... ";
	    logger->log(message, LOG_DEBG);
	    FILE *partial_output = fopen(partial_file_name.c_str(), "r");
	    int c = fgetc(partial_output);
	    while (c != EOF) {
	      fputc(c, output);
	      c = fgetc(partial_output);
	    }
	    fclose(partial_output);
	    remove(partial_file_name.c_str());
	    logger->log("done.\n", LOG_DEBG);
	    partial_file_name = output_path + "/c_TPPOAN_" + to_string(mpidata->rank) + "_" + to_string(ri);
	    message = "Copying binary output in MPI process " + to_string(mpidata->rank) + " of thread " + to_string(ri) + " of " + to_string(ompnumthreads - 1) + "... ";
	    logger->log(message, LOG_DEBG);
	    fstream partial_tppoan;
	    partial_tppoan.open(partial_file_name.c_str(), ios::in | ios::binary);
	    partial_tppoan.seekg(0, ios::end);
	    long buffer_size = partial_tppoan.tellg();
	    char *binary_buffer = new char[buffer_size];
	    partial_tppoan.seekg(0, ios::beg);
	    partial_tppoan.read(binary_buffer, buffer_size);
	    tppoan.write(binary_buffer, buffer_size);
	    partial_tppoan.close();
	    delete[] binary_buffer;
	    remove(partial_file_name.c_str());
	    logger->log("done.\n", LOG_DEBG);
	  }
	}
#endif
	// here go the code to append the files written in MPI processes > 0 to the ones on MPI process 0
#ifdef MPI_VERSION
	if (mpidata->mpirunning) {
	  // only go through this if MPI has been actually used
	  for (int rr=1; rr<mpidata->nprocs; rr++) {
	    // get the data from process rr
	    // how many openmp threads did process rr use?
	    int remotethreads;
	    MPI_Recv(&remotethreads, 1, MPI_INT, rr, 1, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
	    for (int ri=0; ri<remotethreads; ri++) {
	      // first get the ASCII local file
	      char *chunk_buffer;
	      int chunk_buffer_size = -1;
	      MPI_Recv(&chunk_buffer_size, 1, MPI_INT, rr, 1, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
	      while (chunk_buffer_size != 0) {
		char *chunk_buffer = new char[chunk_buffer_size];
		MPI_Recv(chunk_buffer, chunk_buffer_size, MPI_CHAR, rr, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
		fputs(chunk_buffer, output);
		delete[] chunk_buffer;
		MPI_Recv(&chunk_buffer_size, 1, MPI_INT, rr, 1, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
	      }
	      fprintf(output, "\n");

	      // now get the binary local file
	      long buffer_size = 0;
	      // get the size of the buffer
	      MPI_Recv(&buffer_size, 1, MPI_LONG, rr, 1, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
	      // allocate the bufer
	      char *binary_buffer = new char[buffer_size];
	      // actually receive the buffer
	      MPI_Recv(binary_buffer, buffer_size, MPI_CHAR, rr, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
	      // we can write it to disk
	      tppoan.write(binary_buffer, buffer_size);
	      delete[] binary_buffer;
	    }
	  }
	}
#endif
	nvtxRangePop();
	tppoanp->close();
	delete tppoanp;
      } else { // In case TPPOAN could not be opened. Should never happen.
	logger->err("\nERROR: failed to open TPPOAN file.\n");
      }
      fclose(output);
      // Clean memory
      delete cid;
      delete p_scattering_angles;
    } else { // NSPH mismatch between geometry and scatterer configurations.
      throw UnrecognizedConfigurationException(
					       "Inconsistent geometry and scatterer configurations."
					       );
    }
    delete sconf;
    delete gconf;
    chrono::time_point<chrono::high_resolution_clock> t_end = chrono::high_resolution_clock::now();
    elapsed = t_end - t_start;
    string message = "INFO: Calculation lasted " + to_string(elapsed.count()) + "s.\n";
    logger->log(message);
    logger->log("Finished: output written to " + output_path + "/c_OCLU\n");
    time_logger->log(message);
  }

#ifdef MPI_VERSION
  else {
    // here go the code for MPI processes other than 0
    // copy gconf, sconf, cid and p_scattering_angles from MPI process 0
    GeometryConfiguration *gconf = new GeometryConfiguration(mpidata);
    ScattererConfiguration *sconf = new ScattererConfiguration(mpidata);
    ClusterIterationData *cid = new ClusterIterationData(mpidata);
    ScatteringAngles *p_scattering_angles = new ScatteringAngles(mpidata);
    // open separate files for other MPI processes
    // File *output = fopen((output_path + "/c_OCLU_mpi"+ to_string(mpidata->rank)).c_str(), "w");
    // fstream *tppoanp = new fstream;
    // fstream &tppoan = *tppoanp;
    // string tppoan_name = output_path + "/c_TPPOAN_mpi"+ to_string(mpidata->rank);
    // tppoan.open(tppoan_name.c_str(), ios::out | ios::binary);
    // Create this variable and initialise it with a default here, so that it is defined anyway, with or without OpenMP support enabled
    int ompnumthreads = 1;

#pragma omp parallel
    {
      // Create and initialise this variable here, so that if OpenMP is enabled it is local to the thread, and if OpenMP is not enabled it has a well-defiled value anyway
      int myompthread = 0;
#ifdef _OPENMP
      // If OpenMP is enabled, give actual values to myompthread and ompnumthreads, and open thread-local output files
      myompthread = omp_get_thread_num();
      if (myompthread == 0) ompnumthreads = omp_get_num_threads();
#endif
      // To test parallelism, I will now start feeding this function with "clean" copies of the parameters, so that they will not be changed by previous iterations, and each one will behave as the first one. Define all (empty) variables here, so they have the correct scope, then they get different definitions depending on thread number
      ClusterIterationData *cid_2 = NULL;
      FILE *output_2 = NULL;
      fstream *tppoanp_2 = NULL;
      // for threads other than the 0, create distinct copies of all relevant data, while for thread 0 just define new references / pointers to the original ones
      if (myompthread == 0) {
	cid_2 = cid;
	// output_2 = output;
	// tppoanp_2 = tppoanp;
      } else {
	// this is not thread 0, so do create fresh copies of all local variables
	cid_2 = new ClusterIterationData(*cid);
      }
      output_2 = fopen((output_path + "/c_OCLU_" + to_string(mpidata->rank) + "_" + to_string(myompthread)).c_str(), "w");
      tppoanp_2 = new fstream;
      tppoanp_2->open((output_path + "/c_TPPOAN_" + to_string(mpidata->rank) + "_" + to_string(myompthread)).c_str(), ios::out | ios::binary);
      fstream &tppoan_2 = *tppoanp_2;
      // make sure all threads align here: I don't want the following loop to accidentally start for thread 0, possibly modifying some variables before they are copied by all other threads
#pragma omp barrier
      if (myompthread==0) logger->log("Syncing OpenMP threads and starting the loop on wavelengths\n");
      // ok, now I can actually start the parallel calculations
#pragma omp for
      for (int jxi488 = cid_2->firstxi; jxi488 <= cid_2->lastxi; jxi488++) {
	int jer = cluster_jxi488_cycle(jxi488, sconf, gconf, p_scattering_angles, cid_2, output_2, output_path, *tppoanp_2);
      }

#pragma omp barrier
      // only threads different from 0 have to free local copies of variables
      if (myompthread != 0) {
	delete cid_2;
      }
      fclose(output_2);
      tppoanp_2->close();
      delete tppoanp_2;
#pragma omp barrier
      {
	string message = "INFO: Closing thread-local output files of thread " + to_string(myompthread) + " and syncing threads.\n";
	logger->log(message);
      }
    } // closes pragma omp parallel
#pragma omp barrier
    {
      // tell MPI process 0 how many threads we have on this process (not necessarily the same across all processes)
      MPI_Send(&ompnumthreads, 1, MPI_INT, 0, 1, MPI_COMM_WORLD);
      // reopen local files, send them all to MPI process 0
      for (int ri = 0; ri < ompnumthreads; ri++) {
	string partial_file_name = output_path + "/c_OCLU_" + to_string(mpidata->rank) + "_" + to_string(ri);
	string message = "Copying ASCII output in MPI process " + to_string(mpidata->rank) + " of thread " + to_string(ri) + " of " + to_string(ompnumthreads - 1) + "... ";
	logger->log(message, LOG_DEBG);
	fstream partial_output;
	partial_output.open(partial_file_name.c_str(), ios::in | ios::binary);
	partial_output.seekg(0, ios::end);
	const long partial_output_size = partial_output.tellg();
	partial_output.close();
	partial_output.open(partial_file_name.c_str(), ios::in | ios::binary);
	int chunk_buffer_size = 25165824; // Length of char array  with 24Mb size
	char *chunk_buffer = new char[chunk_buffer_size]();
	int full_chunks = (int)(partial_output_size / chunk_buffer_size);
	for (int fi = 0; fi < full_chunks; fi++) {
	  partial_output.read(chunk_buffer, chunk_buffer_size);
	  // If EOF is reached, do not send EOF character.
	  long ptr_position = partial_output.tellg();
	  if (ptr_position == partial_output_size) {
	    chunk_buffer[chunk_buffer_size - 1] = '\0';
	  }
	  // Send the size of the buffer that is being transmitted (Node-0 does not know whether it is full or not)
	  MPI_Send(&chunk_buffer_size, 1, MPI_INT, 0, 1, MPI_COMM_WORLD);
	  // Actually send the file contents to Node-0
	  MPI_Send(chunk_buffer, chunk_buffer_size, MPI_CHAR, 0, 0, MPI_COMM_WORLD);
	}
	long ptr_position = partial_output.tellg();
	if (ptr_position < partial_output_size) {
	  // Send the last partial buffer
	  chunk_buffer_size = partial_output_size - ptr_position;
	  delete[] chunk_buffer;
	  chunk_buffer = new char[chunk_buffer_size];
	  partial_output.read(chunk_buffer, chunk_buffer_size);
	  chunk_buffer[chunk_buffer_size - 1] = '\0';
	  // Send the size of the buffer that is being transmitted (Node-0 does not know whether it is full or not)
	  MPI_Send(&chunk_buffer_size, 1, MPI_INT, 0, 1, MPI_COMM_WORLD);
	  // Actually send the file contents to Node-0
	  MPI_Send(chunk_buffer, chunk_buffer_size, MPI_CHAR, 0, 0, MPI_COMM_WORLD);
	}
	// Send a size 0 flag to inform Node-0 that the transmission is complete
	chunk_buffer_size = 0;
	MPI_Send(&chunk_buffer_size, 1, MPI_INT, 0, 1, MPI_COMM_WORLD);
	partial_output.close();
	delete[] chunk_buffer;
	remove(partial_file_name.c_str());
	logger->log("done.\n", LOG_DEBG);
	
	partial_file_name = output_path + "/c_TPPOAN_" + to_string(mpidata->rank) + "_" + to_string(ri);
	message = "Copying binary output in MPI process " + to_string(mpidata->rank) + " of thread " + to_string(ri) + " of " + to_string(ompnumthreads - 1) + "... ";
	logger->log(message, LOG_DEBG);
	fstream partial_tppoan;
	partial_tppoan.open(partial_file_name.c_str(), ios::in | ios::binary);
	partial_tppoan.seekg(0, ios::end);
	long buffer_size = partial_tppoan.tellg();
	char *binary_buffer = new char[buffer_size];
	partial_tppoan.seekg(0, ios::beg);
	partial_tppoan.read(binary_buffer, buffer_size);
	// tell MPI process 0 how large is the buffer
	MPI_Send(&buffer_size, 1, MPI_LONG, 0, 1, MPI_COMM_WORLD);
	// actually send the buffer
	MPI_Send(binary_buffer, buffer_size, MPI_CHAR, 0, 0, MPI_COMM_WORLD);
	// tppoan.write(binary_buffer, buffer_size);
	partial_tppoan.close();
	delete[] binary_buffer;
	remove(partial_file_name.c_str());
	logger->log("done.\n", LOG_DEBG);
      }
    }
    // Clean memory
    delete cid;
    delete p_scattering_angles;
    delete sconf;
    delete gconf;

  }
#endif
#ifdef USE_MAGMA
  logger->log("INFO: Process " + to_string(mpidata->rank) + " finalizes MAGMA.\n");
  magma_finalize();
#endif
  fclose(timing_file);
  delete time_logger;
  delete logger;
}

int cluster_jxi488_cycle(int jxi488, ScattererConfiguration *sconf, GeometryConfiguration *gconf, ScatteringAngles *sa, ClusterIterationData *cid, FILE *output, const string& output_path, fstream& tppoan)
{
  int nxi = sconf->number_of_scales;
  string message = "INFO: running scale iteration " + to_string(jxi488) + " of " + to_string(nxi) + ".\n";
  Logger *logger = new Logger(LOG_DEBG);
  logger->log(message);
  chrono::duration<double> elapsed;
  chrono::time_point<chrono::high_resolution_clock> interval_start, interval_end;
  int jer = 0;
  int lcalc = 0;
  int jaw = 1;
  int li = gconf->li;
  int le = gconf->le;
  int lm = 0;
  if (le > lm) lm = le;
  if (li > lm) lm = li;
  int nsph = gconf->number_of_spheres;
  np_int mxndm = gconf->mxndm;
  int iavm = gconf->iavm;
  int inpol = gconf->in_pol;
  int npnt = gconf->npnt;
  int npntts = gconf->npntts;
  int isam = gconf->iavm;
  int jwtm = gconf->jwtm;
  np_int ndit = 2 * nsph * cid->c4->nlim;
  int isq, ibf;

  nvtxRangePush("Prepare matrix calculation");
  fprintf(output, "========== JXI =%3d ====================\n", jxi488);
  double xi = sconf->get_scale(jxi488 - 1);
  double exdc = sconf->exdc;
  double exri = sqrt(exdc);
  int idfc = (int)sconf->idfc;
  double vkarg = 0.0;
  if (idfc >= 0) {
    cid->vk = xi * cid->wn;
    vkarg = cid->vk;
    fprintf(output, "  VK=%15.7lE, XI=%15.7lE\n", cid->vk, xi);
  } else {
    vkarg = xi * cid->vk;
    cid->sqsfi = 1.0 / (xi * xi);
    fprintf(output, "  XI=%15.7lE\n", xi);
  }
  hjv(exri, vkarg, jer, lcalc, cid->arg, cid->c1, cid->c1ao, cid->c4);
  if (jer != 0) {
    fprintf(output, "  STOP IN HJV\n");
    return jer;
    // break; // rewrite this to go to the end of the function, to free locally allocated variables and return jer
  }
  for (int i132 = 1; i132 <= nsph; i132++) {
    int iogi = cid->c1->iog[i132 - 1];
    if (iogi != i132) {
      for (int l123 = 1; l123 <= li; l123++) {
	cid->c1->rmi[l123 - 1][i132 - 1] = cid->c1->rmi[l123 - 1][iogi - 1];
	cid->c1->rei[l123 - 1][i132 - 1] = cid->c1->rei[l123 - 1][iogi - 1];
      } // l123 loop
    } else {
      int nsh = cid->c1->nshl[i132 - 1];
      int ici = (nsh + 1) / 2;
      if (idfc == 0) {
	for (int ic = 0; ic < ici; ic++)
	  cid->c2->dc0[ic] = sconf->get_dielectric_constant(ic, i132 - 1, jxi488 - 1);
      } else {
	if (jxi488 == 1) {
	  for (int ic = 0; ic < ici; ic++)
	    cid->c2->dc0[ic] = sconf->get_dielectric_constant(ic, i132 - 1, 0);
	}
      }
      if (nsh % 2 == 0) cid->c2->dc0[ici] = exdc;
      dme(
	  cid->c4->li, i132, npnt, npntts, vkarg, exdc, exri,
	  cid->c1, cid->c2, jer, lcalc, cid->arg
	  );
      if (jer != 0) {
	fprintf(output, "  STOP IN DME\n");
	return jer;
	//break;
      }
    }
    if (jer != 0) {
      return jer;
      //break;
    }
  } // i132 loop
  nvtxRangePop();
  interval_start = chrono::high_resolution_clock::now();
  nvtxRangePush("Calculate inverted matrix");
  cms(cid->am, cid->c1, cid->c1ao, cid->c4, cid->c6);
  nvtxRangePop();
  interval_end = chrono::high_resolution_clock::now();
  elapsed = interval_end - interval_start;
  message = "INFO: matrix calculation for scale " + to_string(jxi488) + " took " + to_string(elapsed.count()) + "s.\n";
  logger->log(message);
  interval_start = chrono::high_resolution_clock::now();
  nvtxRangePush("Invert the matrix");
  invert_matrix(cid->am, ndit, jer, mxndm);
  nvtxRangePop();
  interval_end = chrono::high_resolution_clock::now();
  elapsed = interval_end - interval_start;
  message = "INFO: matrix inversion for scale " + to_string(jxi488) + " took " + to_string(elapsed.count()) + "s.\n";
  logger->log(message);
  if (jer != 0) {
    message = "ERROR: matrix inversion ended with error code " + to_string(jer) + ".\n";
    logger->err(message);
    return jer;
    // break; // jxi488 loop: goes to memory clean
  }
  interval_start = chrono::high_resolution_clock::now();
  nvtxRangePush("Average calculation");
  ztm(cid->am, cid->c1, cid->c1ao, cid->c4, cid->c6, cid->c9);
  if (idfc >= 0) {
    if (jxi488 == jwtm) {
      int nlemt = 2 * cid->c4->nlem;
      string ttms_name = output_path + "/c_TTMS.hd5";
      TransitionMatrix::write_binary(ttms_name, nlemt, lm, cid->vk, exri, cid->c1ao->am0m, "HDF5");
      ttms_name = output_path + "/c_TTMS";
      TransitionMatrix::write_binary(ttms_name, nlemt, lm, cid->vk, exri, cid->c1ao->am0m);
    }
  }
  // label 156: continue from here
  if (inpol == 0) {
    fprintf(output, "   LIN\n");
  } else { // label 158
    fprintf(output, "  CIRC\n");
  }
  // label 160
  double cs0 = 0.25 * cid->vk * cid->vk * cid->vk / acos(0.0);
  double csch = 0.0, qschu = 0.0, pschu = 0.0, s0mag = 0.0;
  dcomplex s0 = 0.0 + 0.0 * I;
  scr0(cid->vk, exri, cid->c1, cid->c1ao, cid->c3, cid->c4);
  double sqk = cid->vk * cid->vk * exdc;
  aps(cid->zpv, cid->c4->li, nsph, cid->c1, sqk, cid->gaps);
  rabas(inpol, cid->c4->li, nsph, cid->c1, cid->tqse, cid->tqspe, cid->tqss, cid->tqsps);
  if (cid->c4->li != cid->c4->le) fprintf(output, "     SPHERES; LMX=LI\n");
  for (int i170 = 1; i170 <= nsph; i170++) {
    if (cid->c1->iog[i170 - 1] >= i170) {
      int i = i170 - 1;
      double albeds = cid->c1->sscs[i] / cid->c1->sexs[i];
      cid->c1->sqscs[i] *= cid->sqsfi;
      cid->c1->sqabs[i] *= cid->sqsfi;
      cid->c1->sqexs[i] *= cid->sqsfi;
      fprintf(output, "     SPHERE %2d\n", i170);
      if (cid->c1->nshl[i] != 1) {
	fprintf(output, "  SIZE=%15.7lE\n", cid->c2->vsz[i]);
      } else { // label 162
	fprintf(output, "  SIZE=%15.7lE, REFRACTIVE INDEX=%15.7lE%15.7lE\n", cid->c2->vsz[i], real(cid->c2->vkt[i]), imag(cid->c2->vkt[i]));
      }
      // label 164
      fprintf(output, " ----- SCS ----- ABS ----- EXS ----- ALBEDS --\n");
      fprintf(output, " %14.7lE%15.7lE%15.7lE%15.7lE\n", cid->c1->sscs[i], cid->c1->sabs[i], cid->c1->sexs[i], albeds);
      fprintf(output, " ---- SCS/GS -- ABS/GS -- EXS/GS ---\n");
      fprintf(output, " %14.7lE%15.7lE%15.7lE\n", cid->c1->sqscs[i], cid->c1->sqabs[i], cid->c1->sqexs[i]);
      fprintf(output, "  FSAS=%15.7lE%15.7lE\n", real(cid->c1->fsas[i]), imag(cid->c1->fsas[i]));
      csch = 2.0 * cid->vk * cid->sqsfi / cid->c1->gcsv[i];
      s0 = cid->c1->fsas[i] * exri;
      qschu = imag(s0) * csch;
      pschu = real(s0) * csch;
      s0mag = cabs(s0) * cs0;
      fprintf(output, "  QSCHU=%15.7lE, PSCHU=%15.7lE, S0MAG=%15.7lE\n", qschu, pschu, s0mag);
      double rapr = cid->c1->sexs[i] - cid->gaps[i];
      double cosav = cid->gaps[i] / cid->c1->sscs[i];
      fprintf(output, "  COSAV=%15.7lE, RAPRS=%15.7lE\n", cosav, rapr);
      fprintf(output, "  IPO= 1, TQEk=%15.7lE, TQSk=%15.7lE\n", cid->tqse[0][i], cid->tqss[0][i]);
      fprintf(output, "  IPO= 2, TQEk=%15.7lE, TQSk=%15.7lE\n", cid->tqse[1][i], cid->tqss[1][i]);
    }
  } // i170 loop
  fprintf(output, "  FSAT=%15.7lE%15.7lE\n", real(cid->c3->tfsas), imag(cid->c3->tfsas));
  csch = 2.0 * cid->vk * cid->sqsfi / cid->c3->gcs;
  s0 = cid->c3->tfsas * exri;
  qschu = imag(s0) * csch;
  pschu = real(s0) * csch;
  s0mag = cabs(s0) * cs0;
  fprintf(output, "  QSCHU=%15.7lE, PSCHU=%15.7lE, S0MAG=%15.7lE\n", qschu, pschu, s0mag);
  tppoan.write(reinterpret_cast<char *>(&(cid->vk)), sizeof(double));
  pcrsm0(cid->vk, exri, inpol, cid->c1, cid->c1ao, cid->c4);
  apcra(cid->zpv, cid->c4->le, cid->c1ao->am0m, inpol, sqk, cid->gapm, cid->gappm);
  nvtxRangePop();
  interval_end = chrono::high_resolution_clock::now();
  elapsed = interval_end - interval_start;
  message = "INFO: average calculation for scale " + to_string(jxi488) + " took " + to_string(elapsed.count()) + "s.\n";
  logger->log(message);
  interval_start = chrono::high_resolution_clock::now();
  nvtxRangePush("Angle loop");
  double th = sa->th;
  for (int jth486 = 1; jth486 <= sa->nth; jth486++) { // OpenMP portable?
    double ph = sa->ph;
    double cost = 0.0, sint = 0.0, cosp = 0.0, sinp = 0.0;
    for (int jph484 = 1; jph484 <= sa->nph; jph484++) {
      int jw = 0;
      if (sa->nk != 1 || jxi488 <= 1) {
	upvmp(th, ph, 0, cost, sint, cosp, sinp, cid->u, cid->upmp, cid->unmp);
	if (isam >= 0) {
	  wmamp(
		0, cost, sint, cosp, sinp, inpol, cid->c4->le, 0,
		nsph, cid->argi, cid->u, cid->upmp, cid->unmp, cid->c1
		);
	  // label 182
	  apc(cid->zpv, cid->c4->le, cid->c1ao->am0m, cid->c1->w, sqk, cid->gap, cid->gapp);
	  raba(cid->c4->le, cid->c1ao->am0m, cid->c1->w, cid->tqce, cid->tqcpe, cid->tqcs, cid->tqcps);
	  jw = 1;
	}
      } else { // label 180, NK == 1 AND JXI488 == 1
	if (isam >= 0) {
	  // label 182
	  apc(cid->zpv, cid->c4->le, cid->c1ao->am0m, cid->c1->w, sqk, cid->gap, cid->gapp);
	  raba(cid->c4->le, cid->c1ao->am0m, cid->c1->w, cid->tqce, cid->tqcpe, cid->tqcs, cid->tqcps);
	  jw = 1;
	}
      }
      // label 184
      double thsl = sa->ths;
      double phsph = 0.0;
      for (int jths = 1; jths <= sa->nths; jths++) {
	double ths = thsl;
	int icspnv = 0;
	if (isam > 1) ths += sa->thsca;
	if (isam >= 1) {
	  phsph = 0.0;
	  if (ths < 0.0 || ths > 180.0) phsph = 180.0;
	  if (ths < 0.0) ths *= -1.0;
	  if (ths > 180.0) ths = 360.0 - ths;
	  if (phsph != 0.0) icspnv = 1;
	}
	// label 186
	double phs = sa->phs;
	for (int jphs = 1; jphs <= sa->nphs; jphs++) {
	  double costs = 0.0, sints = 0.0, cosps = 0.0, sinps = 0.0;
	  if (isam >= 1) {
	    phs = sa->ph + phsph;
	    if (phs > 360.0) phs -= 360.0;
	  }
	  // label 188
	  bool goto190 = (sa->nks == 1 && (jxi488 > 1 || jth486 > 1 || jph484 > 1));
	  if (!goto190) {
	    upvmp(ths, phs, icspnv, costs, sints, cosps, sinps, cid->us, cid->upsmp, cid->unsmp);
	    if (isam >= 0)
	      wmamp(
		    2, costs, sints, cosps, sinps, inpol, cid->c4->le,
		    0, nsph, cid->args, cid->us, cid->upsmp, cid->unsmp, cid->c1
		    );
	  }
	  // label 190
	  if (sa->nkks != 1 || jxi488 <= 1) {
	    upvsp(
		  cid->u, cid->upmp, cid->unmp, cid->us, cid->upsmp, cid->unsmp, cid->up, cid->un, cid->ups, cid->uns,
		  cid->duk, isq, ibf, cid->scan, cid->cfmp, cid->sfmp, cid->cfsp, cid->sfsp
		  );
	    if (isam < 0) {
	      wmasp(
		    cost, sint, cosp, sinp, costs, sints, cosps, sinps,
		    cid->u, cid->up, cid->un, cid->us, cid->ups, cid->uns, isq, ibf, inpol, cid->c4->le,
		    0, nsph, cid->argi, cid->args, cid->c1
		    );
	    } else { // label 192
	      for (int i193 = 0; i193 < 3; i193++) {
		cid->up[i193] = cid->upmp[i193];
		cid->un[i193] = cid->unmp[i193];
		cid->ups[i193] = cid->upsmp[i193];
		cid->uns[i193] = cid->unsmp[i193];
	      }
	    }
	  }
	  // label 194
	  if (iavm == 1) crsm1(cid->vk, exri, cid->c1, cid->c1ao, cid->c4, cid->c6);
	  if (isam < 0) {
	    apc(cid->zpv, cid->c4->le, cid->c1ao->am0m, cid->c1->w, sqk, cid->gap, cid->gapp);
	    raba(cid->c4->le, cid->c1ao->am0m, cid->c1->w, cid->tqce, cid->tqcpe, cid->tqcs, cid->tqcps);
	    jw = 1;
	  }
	  // label 196
	  tppoan.write(reinterpret_cast<char *>(&th), sizeof(double));
	  tppoan.write(reinterpret_cast<char *>(&ph), sizeof(double));
	  tppoan.write(reinterpret_cast<char *>(&ths), sizeof(double));
	  tppoan.write(reinterpret_cast<char *>(&phs), sizeof(double));
	  tppoan.write(reinterpret_cast<char *>(&(cid->scan)), sizeof(double));
	  if (jaw != 0) {
	    jaw = 0;
	    mextc(cid->vk, exri, cid->c1ao->fsacm, cid->cextlr, cid->cext);
	    // We now have some implicit loops writing to binary
	    for (int i = 0; i < 4; i++) {
	      for (int j = 0; j < 4; j++) {
		double value = cid->cext[i][j];
		tppoan.write(reinterpret_cast<char *>(&value), sizeof(double));
	      }
	    }
	    for (int i = 0; i < 2; i++) {
	      double value = cid->c1ao->scscm[i];
	      tppoan.write(reinterpret_cast<char *>(&value), sizeof(double));
	      value = real(cid->c1ao->scscpm[i]);
	      tppoan.write(reinterpret_cast<char *>(&value), sizeof(double));
	      value = imag(cid->c1ao->scscpm[i]);
	      tppoan.write(reinterpret_cast<char *>(&value), sizeof(double));
	      value = cid->c1ao->ecscm[i];
	      tppoan.write(reinterpret_cast<char *>(&value), sizeof(double));
	      value = real(cid->c1ao->ecscpm[i]);
	      tppoan.write(reinterpret_cast<char *>(&value), sizeof(double));
	      value = imag(cid->c1ao->ecscpm[i]);
	      tppoan.write(reinterpret_cast<char *>(&value), sizeof(double));
	    }
	    for (int i = 0; i < 3; i++) {
	      for (int j = 0; j < 2; j++) {
		double value = cid->gapm[i][j];
		tppoan.write(reinterpret_cast<char *>(&value), sizeof(double));
		value = real(cid->gappm[i][j]);
		tppoan.write(reinterpret_cast<char *>(&value), sizeof(double));
		value = imag(cid->gappm[i][j]);
		tppoan.write(reinterpret_cast<char *>(&value), sizeof(double));
	      }
	    }
	    fprintf(output, "     CLUSTER (ENSEMBLE AVERAGE, MODE%2d)\n", iavm);
	    int jlr = 2;
	    for (int ilr210 = 1; ilr210 <= 2; ilr210++) {
	      int ipol = (ilr210 % 2 == 0) ? 1 : -1;
	      if (ilr210 == 2) jlr = 1;
	      double extsm = cid->c1ao->ecscm[ilr210 - 1];
	      double qextm = extsm * cid->sqsfi / cid->c3->gcs;
	      double extrm = extsm / cid->c3->ecs;
	      double scasm = cid->c1ao->scscm[ilr210 - 1];
	      double albdm = scasm / extsm;
	      double qscam = scasm * cid->sqsfi / cid->c3->gcs;
	      double scarm = scasm / cid->c3->scs;
	      double abssm = extsm - scasm;
	      double qabsm = abssm * cid->sqsfi / cid->c3->gcs;
	      double absrm = abssm / cid->c3->acs;
	      double acsecs = cid->c3->acs / cid->c3->ecs;
	      if (acsecs >= -1.0e-6 && acsecs <= 1.0e-6) absrm = 1.0;
	      dcomplex s0m = cid->c1ao->fsacm[ilr210 - 1][ilr210 - 1] * exri;
	      double qschum = imag(s0m) * csch;
	      double pschum = real(s0m) * csch;
	      double s0magm = cabs(s0m) * cs0;
	      double rfinrm = real(cid->c1ao->fsacm[ilr210 - 1][ilr210 - 1]) / real(cid->c3->tfsas);
	      double extcrm = imag(cid->c1ao->fsacm[ilr210 - 1][ilr210 - 1]) / imag(cid->c3->tfsas);
	      if (inpol == 0) {
		fprintf(output, "   LIN %2d\n", ipol);
	      } else { // label 206
		fprintf(output, "  CIRC %2d\n", ipol);
	      }
	      // label 208
	      fprintf(output, " ----- SCC ----- ABC ----- EXC ----- ALBEDC --\n");
	      fprintf(output, " %14.7lE%15.7lE%15.7lE%15.7lE\n", scasm, abssm, extsm, albdm);
	      fprintf(output, " --- SCC/TGS - ABC/TGS - EXC/TGS ---\n");
	      fprintf(output, " %14.7lE%15.7lE%15.7lE\n", qscam, qabsm, qextm);
	      fprintf(output, " ---- SCCRT --- ABCRT --- EXCRT ----\n");
	      fprintf(output, " %14.7lE%15.7lE%15.7lE\n", scarm, absrm, extrm);
	      fprintf(
		      output, "  FSAC(%1d,%1d)=%15.7lE%15.7lE   FSAC(%1d,%1d)=%15.7lE%15.7lE\n",
		      ilr210, ilr210, real(cid->c1ao->fsacm[ilr210 - 1][ilr210 - 1]),
		      imag(cid->c1ao->fsacm[ilr210 - 1][ilr210 - 1]), jlr, ilr210,
		      real(cid->c1ao->fsacm[jlr - 1][ilr210 - 1]), imag(cid->c1ao->fsacm[jlr - 1][ilr210 - 1])
		      );
	      fprintf(
		      output, "  RE(FSAC(%1d,%1d))/RE(TFSAS)=%15.7lE, IM(FSAC(%1d,%1d))/IM(TFSAS)=%15.7lE\n",
		      ilr210, ilr210, rfinrm, ilr210, ilr210, extcrm
		      );
	      fprintf(output, "  QSCHU=%15.7lE, PSCHU=%15.7lE, S0MAG=%15.7lE\n", qschum, pschum, s0magm);
	      double rapr = cid->c1ao->ecscm[ilr210 - 1] - cid->gapm[2][ilr210 - 1];
	      double cosav = cid->gapm[2][ilr210 - 1] / cid->c1ao->scscm[ilr210 - 1];
	      double fz = rapr;
	      fprintf(output, "  COSAV=%15.7lE, RAPRS=%15.7lE\n", cosav, rapr);
	      fprintf(output, "  Fk=%15.7lE\n", fz);
	    } // ilr210 loop
	    double rmbrif = (real(cid->c1ao->fsacm[0][0]) - real(cid->c1ao->fsacm[1][1])) / real(cid->c1ao->fsacm[0][0]);
	    double rmdchr = (imag(cid->c1ao->fsacm[0][0]) - imag(cid->c1ao->fsacm[1][1])) / imag(cid->c1ao->fsacm[0][0]);
	    fprintf(output, "  (RE(FSAC(1,1))-RE(FSAC(2,2)))/RE(FSAC(1,1))=%15.7lE\n", rmbrif);
	    fprintf(output, "  (IM(FSAC(1,1))-IM(FSAC(2,2)))/IM(FSAC(1,1))=%15.7lE\n", rmdchr);
	  }
	  // label 212
	  fprintf(output, "********** JTH =%3d, JPH =%3d, JTHS =%3d, JPHS =%3d ********************\n", jth486, jph484, jths, jphs);
	  fprintf(output, "  TIDG=%10.3lE, PIDG=%10.3lE, TSDG=%10.3lE, PSDG=%10.3lE\n", th, ph, ths, phs);
	  fprintf(output, "  SCAND=%10.3lE\n", cid->scan);
	  fprintf(output, "  CFMP=%15.7lE, SFMP=%15.7lE\n", cid->cfmp, cid->sfmp);
	  fprintf(output, "  CFSP=%15.7lE, SFSP=%15.7lE\n", cid->cfsp, cid->sfsp);
	  if (isam >= 0) {
	    fprintf(output, "  UNI=(%12.5lE,%12.5lE,%12.5lE)\n", cid->un[0], cid->un[1], cid->un[2]);
	    fprintf(output, "  UNS=(%12.5lE,%12.5lE,%12.5lE)\n", cid->uns[0], cid->uns[1], cid->uns[2]);
	  } else { // label 214
	    fprintf(output, "  UN=(%12.5lE,%12.5lE,%12.5lE)\n\n", cid->un[0], cid->un[1], cid->un[2]);
	  }
	  // label 220
	  if (inpol == 0) {
	    fprintf(output, "   LIN\n");
	  } else { // label 222
	    fprintf(output, "  CIRC\n");
	  }
	  // label 224
	  scr2(cid->vk, vkarg, exri, cid->duk, cid->c1, cid->c1ao, cid->c3, cid->c4);
	  if (cid->c4->li != cid->c4->le) fprintf(output, "     SPHERES; LMX=MIN0(LI,LE)\n");
	  for (int i226 = 1; i226 <= nsph; i226++) {
	    if (cid->c1->iog[i226 - 1] >= i226) {
	      fprintf(output, "     SPHERE %2d\n", i226);
	      fprintf(
		      output, "  SAS(1,1)=%15.7lE%15.7lE, SAS(2,1)=%15.7lE%15.7lE\n",
		      real(cid->c1->sas[i226 - 1][0][0]), imag(cid->c1->sas[i226 - 1][0][0]),
		      real(cid->c1->sas[i226 - 1][1][0]), imag(cid->c1->sas[i226 - 1][1][0])
		      );
	      fprintf(
		      output, "  SAS(1,2)=%15.7lE%15.7lE, SAS(2,2)=%15.7lE%15.7lE\n",
		      real(cid->c1->sas[i226 - 1][0][1]), imag(cid->c1->sas[i226 - 1][0][1]),
		      real(cid->c1->sas[i226 - 1][1][1]), imag(cid->c1->sas[i226 - 1][1][1])
		      );
	      for (int j225 = 0; j225 < 16; j225++) {
		cid->c1->vint[j225] = cid->c1->vints[i226 - 1][j225];
	      } // j225 loop
	      mmulc(cid->c1->vint, cid->cmullr, cid->cmul);
	      fprintf(output, "  MULS\n");
	      for (int i1 = 0; i1 < 4; i1++) {
		fprintf(
			output, "        %15.7lE%15.7lE%15.7lE%15.7lE\n",
			cid->cmul[i1][0], cid->cmul[i1][1], cid->cmul[i1][2], cid->cmul[i1][3]
			);
	      } // i1 loop
	      fprintf(output, "  MULSLR\n");
	      for (int i1 = 0; i1 < 4; i1++) {
		fprintf(
			output, "        %15.7lE%15.7lE%15.7lE%15.7lE\n",
			cid->cmullr[i1][0], cid->cmullr[i1][1], cid->cmullr[i1][2], cid->cmullr[i1][3]
			);
	      } // i1 loop
	    }
	  } // i226 loop
	  fprintf(
		  output, "  SAT(1,1)=%15.7lE%15.7lE, SAT(2,1)=%15.7lE%15.7lE\n",
		  real(cid->c3->tsas[0][0]), imag(cid->c3->tsas[0][0]),
		  real(cid->c3->tsas[1][0]), imag(cid->c3->tsas[1][0])
		  );
	  fprintf(
		  output, "  SAT(1,2)=%15.7lE%15.7lE, SAT(2,2)=%15.7lE%15.7lE\n",
		  real(cid->c3->tsas[0][1]), imag(cid->c3->tsas[0][1]),
		  real(cid->c3->tsas[1][1]), imag(cid->c3->tsas[1][1])
		  );
	  fprintf(output, "     CLUSTER\n");
	  pcros(cid->vk, exri, cid->c1, cid->c1ao, cid->c4);
	  mextc(cid->vk, exri, cid->c1ao->fsac, cid->cextlr, cid->cext);
	  mmulc(cid->c1->vint, cid->cmullr, cid->cmul);
	  if (jw != 0) {
	    jw = 0;
	    // Some implicit loops writing to binary.
	    for (int i = 0; i < 4; i++) {
	      for (int j = 0; j < 4; j++) {
		double value = cid->cext[i][j];
		tppoan.write(reinterpret_cast<char *>(&value), sizeof(double));
	      }
	    }
	    for (int i = 0; i < 2; i++) {
	      double value = cid->c1ao->scsc[i];
	      tppoan.write(reinterpret_cast<char *>(&value), sizeof(double));
	      value = real(cid->c1ao->scscp[i]);
	      tppoan.write(reinterpret_cast<char *>(&value), sizeof(double));
	      value = imag(cid->c1ao->scscp[i]);
	      tppoan.write(reinterpret_cast<char *>(&value), sizeof(double));
	      value = cid->c1ao->ecsc[i];
	      tppoan.write(reinterpret_cast<char *>(&value), sizeof(double));
	      value = real(cid->c1ao->ecscp[i]);
	      tppoan.write(reinterpret_cast<char *>(&value), sizeof(double));
	      value = imag(cid->c1ao->ecscp[i]);
	      tppoan.write(reinterpret_cast<char *>(&value), sizeof(double));
	    }
	    for (int i = 0; i < 3; i++) {
	      for (int j = 0; j < 2; j++) {
		double value = cid->gap[i][j];
		tppoan.write(reinterpret_cast<char *>(&value), sizeof(double));
		value = real(cid->gapp[i][j]);
		tppoan.write(reinterpret_cast<char *>(&value), sizeof(double));
		value = imag(cid->gapp[i][j]);
		tppoan.write(reinterpret_cast<char *>(&value), sizeof(double));
	      }
	    }
	    for (int i = 0; i < 2; i++) {
	      for (int j = 0; j < 3; j++) {
		double value = cid->tqce[i][j];
		tppoan.write(reinterpret_cast<char *>(&value), sizeof(double));
		value = real(cid->tqcpe[i][j]);
		tppoan.write(reinterpret_cast<char *>(&value), sizeof(double));
		value = imag(cid->tqcpe[i][j]);
		tppoan.write(reinterpret_cast<char *>(&value), sizeof(double));
	      }
	    }
	    for (int i = 0; i < 2; i++) {
	      for (int j = 0; j < 3; j++) {
		double value = cid->tqcs[i][j];
		tppoan.write(reinterpret_cast<char *>(&value), sizeof(double));
		value = real(cid->tqcps[i][j]);
		tppoan.write(reinterpret_cast<char *>(&value), sizeof(double));
		value = imag(cid->tqcps[i][j]);
		tppoan.write(reinterpret_cast<char *>(&value), sizeof(double));
	      }
	    }
	    for (int i = 0; i < 3; i++) {
	      double value = cid->u[i];
	      tppoan.write(reinterpret_cast<char *>(&value), sizeof(double));
	      value = cid->up[i];
	      tppoan.write(reinterpret_cast<char *>(&value), sizeof(double));
	      value = cid->un[i];
	      tppoan.write(reinterpret_cast<char *>(&value), sizeof(double));
	    }
	  }
	  // label 254
	  for (int i = 0; i < 16; i++) {
	    double value = real(cid->c1->vint[i]);
	    tppoan.write(reinterpret_cast<char *>(&value), sizeof(double));
	    value = imag(cid->c1->vint[i]);
	    tppoan.write(reinterpret_cast<char *>(&value), sizeof(double));
	  }
	  for (int i = 0; i < 4; i++) {
	    for (int j = 0; j < 4; j++) {
	      double value = cid->cmul[i][j];
	      tppoan.write(reinterpret_cast<char *>(&value), sizeof(double));
	    }
	  }
	  int jlr = 2;
	  for (int ilr290 = 1; ilr290 <= 2; ilr290++) {
	    int ipol = (ilr290 % 2 == 0) ? 1 : -1;
	    if (ilr290 == 2) jlr = 1;
	    double extsec = cid->c1ao->ecsc[ilr290 - 1];
	    double qext = extsec * cid->sqsfi / cid->c3->gcs;
	    double extrat = extsec / cid->c3->ecs;
	    double scasec = cid->c1ao->scsc[ilr290 - 1];
	    double albedc = scasec / extsec;
	    double qsca = scasec * cid->sqsfi / cid->c3->gcs;
	    double scarat = scasec / cid->c3->scs;
	    double abssec = extsec - scasec;
	    double qabs = abssec * cid->sqsfi / cid->c3->gcs;
	    double absrat = 1.0;
	    double ratio = cid->c3->acs / cid->c3->ecs;
	    if (ratio < -1.0e-6 || ratio > 1.0e-6) absrat = abssec / cid->c3->acs;
	    s0 = cid->c1ao->fsac[ilr290 - 1][ilr290 - 1] * exri;
	    double qschu = imag(s0) * csch;
	    double pschu = real(s0) * csch;
	    s0mag = cabs(s0) * cs0;
	    double refinr = real(cid->c1ao->fsac[ilr290 - 1][ilr290 - 1]) / real(cid->c3->tfsas);
	    double extcor = imag(cid->c1ao->fsac[ilr290 - 1][ilr290 - 1]) / imag(cid->c3->tfsas);
	    if (inpol == 0) {
	      fprintf(output, "   LIN %2d\n", ipol);
	    } else { // label 273
	      fprintf(output, "  CIRC %2d\n", ipol);
	    }
	    // label 275
	    fprintf(output, " ----- SCC ----- ABC ----- EXC ----- ALBEDC --\n");
	    fprintf(
		    output, " %14.7lE%15.7lE%15.7lE%15.7lE\n",
		    scasec, abssec, extsec, albedc
		    );
	    fprintf(output, " --- SCC/TGS - ABC/TGS - EXC/TGS ---\n");
	    fprintf(
		    output, " %14.7lE%15.7lE%15.7lE\n",
		    qsca, qabs, qext
		    );
	    fprintf(output, " ---- SCCRT --- ABCRT --- EXCRT ----\n");
	    fprintf(
		    output, " %14.7lE%15.7lE%15.7lE\n",
		    scarat, absrat, extrat
		    );
	    fprintf(
		    output, "  FSAC(%1d,%1d)=%15.7lE%15.7lE   FSAC(%1d,%1d)=%15.7lE%15.7lE\n",
		    ilr290, ilr290, real(cid->c1ao->fsac[ilr290 - 1][ilr290 - 1]), imag(cid->c1ao->fsac[ilr290 - 1][ilr290 - 1]),
		    jlr, ilr290, real(cid->c1ao->fsac[jlr - 1][ilr290 - 1]), imag(cid->c1ao->fsac[jlr - 1][ilr290 - 1])
		    );
	    fprintf(
		    output, "   SAC(%1d,%1d)=%15.7lE%15.7lE    SAC(%1d,%1d)=%15.7lE%15.7lE\n",
		    ilr290, ilr290, real(cid->c1ao->sac[ilr290 - 1][ilr290 - 1]), imag(cid->c1ao->sac[ilr290 - 1][ilr290 - 1]),
		    jlr, ilr290, real(cid->c1ao->sac[jlr - 1][ilr290 - 1]), imag(cid->c1ao->sac[jlr - 1][ilr290 - 1])
		    );
	    fprintf(
		    output, "  RE(FSAC(%1d,%1d))/RE(TFSAS)=%15.7lE, IM(FSAC(%1d,%1d))/IM(TFSAS)=%15.7lE\n",
		    ilr290, ilr290, refinr, ilr290, ilr290, extcor
		    );
	    fprintf(
		    output, "  QSCHU=%15.7lE, PSCHU=%15.7lE, S0MAG=%15.7lE\n",
		    qschu, pschu, s0mag
		    );
	    bool goto190 = isam >= 0 && (jths > 1 || jphs > 1);
	    if (!goto190) {
	      cid->gapv[0] = cid->gap[0][ilr290 - 1];
	      cid->gapv[1] = cid->gap[1][ilr290 - 1];
	      cid->gapv[2] = cid->gap[2][ilr290 - 1];
	      double extins = cid->c1ao->ecsc[ilr290 - 1];
	      double scatts = cid->c1ao->scsc[ilr290 - 1];
	      double rapr, cosav, fp, fn, fk, fx, fy, fz;
	      rftr(cid->u, cid->up, cid->un, cid->gapv, extins, scatts, rapr, cosav, fp, fn, fk, fx, fy, fz);
	      fprintf(output, "  COSAV=%15.7lE, RAPRS=%15.7lE\n", cosav, rapr);
	      fprintf(output, "  Fl=%15.7lE, Fr=%15.7lE, Fk=%15.7lE\n", fp, fn, fk);
	      fprintf(output, "  Fx=%15.7lE, Fy=%15.7lE, Fz=%15.7lE\n", fx, fy, fz);
	      cid->tqev[0] = cid->tqce[ilr290 - 1][0];
	      cid->tqev[1] = cid->tqce[ilr290 - 1][1];
	      cid->tqev[2] = cid->tqce[ilr290 - 1][2];
	      cid->tqsv[0] = cid->tqcs[ilr290 - 1][0];
	      cid->tqsv[1] = cid->tqcs[ilr290 - 1][1];
	      cid->tqsv[2] = cid->tqcs[ilr290 - 1][2];
	      double tep, ten, tek, tsp, tsn, tsk;
	      tqr(cid->u, cid->up, cid->un, cid->tqev, cid->tqsv, tep, ten, tek, tsp, tsn, tsk);
	      fprintf(output, "   TQEl=%15.7lE,  TQEr=%15.7lE,  TQEk=%15.7lE\n", tep, ten, tek);
	      fprintf(output, "   TQSl=%15.7lE,  TQSr=%15.7lE,  TQSk=%15.7lE\n", tsp, tsn, tsk);
	      fprintf(
		      output, "   TQEx=%15.7lE,  TQEy=%15.7lE,  TQEz=%15.7lE\n",
		      cid->tqce[ilr290 - 1][0], cid->tqce[ilr290 - 1][1], cid->tqce[ilr290 - 1][2]
		      );
	      fprintf(
		      output, "   TQSx=%15.7lE,  TQSy=%15.7lE,  TQSz=%15.7lE\n",
		      cid->tqcs[ilr290 - 1][0], cid->tqcs[ilr290 - 1][1], cid->tqcs[ilr290 - 1][2]
		      );
	    }
	  } //ilr290 loop
	  double rbirif = (real(cid->c1ao->fsac[0][0]) - real(cid->c1ao->fsac[1][1])) / real(cid->c1ao->fsac[0][0]);
	  double rdichr = (imag(cid->c1ao->fsac[0][0]) - imag(cid->c1ao->fsac[1][1])) / imag(cid->c1ao->fsac[0][0]);
	  fprintf(output, "  (RE(FSAC(1,1))-RE(FSAC(2,2)))/RE(FSAC(1,1))=%15.7lE\n", rbirif);
	  fprintf(output, "  (IM(FSAC(1,1))-IM(FSAC(2,2)))/IM(FSAC(1,1))=%15.7lE\n", rdichr);
	  fprintf(output, "  MULC\n");
	  for (int i = 0; i < 4; i++) {
	    fprintf(
		    output, "        %15.7lE%15.7lE%15.7lE%15.7lE\n",
		    cid->cmul[i][0], cid->cmul[i][1], cid->cmul[i][2], cid->cmul[i][3]
		    );
	  }
	  fprintf(output, "  MULCLR\n");
	  for (int i = 0; i < 4; i++) {
	    fprintf(
		    output, "        %15.7lE%15.7lE%15.7lE%15.7lE\n",
		    cid->cmullr[i][0], cid->cmullr[i][1], cid->cmullr[i][2], cid->cmullr[i][3]
		    );
	  }
	  if (iavm != 0) {
	    mmulc(cid->c1ao->vintm, cid->cmullr, cid->cmul);
	    // Some implicit loops writing to binary.
	    for (int i = 0; i < 16; i++) {
	      double value;
	      value = real(cid->c1ao->vintm[i]);
	      tppoan.write(reinterpret_cast<char *>(&value), sizeof(double));
	      value = imag(cid->c1ao->vintm[i]);
	      tppoan.write(reinterpret_cast<char *>(&value), sizeof(double));
	    }
	    for (int i = 0; i < 4; i++) {
	      for (int j = 0; j < 4; j++) {
		double value = cid->cmul[i][j];
		tppoan.write(reinterpret_cast<char *>(&value), sizeof(double));
	      }
	    }
	    fprintf(output, "     CLUSTER (ENSEMBLE AVERAGE, MODE%2d)\n", iavm);
	    if (inpol == 0) {
	      fprintf(output, "   LIN\n");
	    } else { // label 316
	      fprintf(output, "  CIRC\n");
	    }
	    // label 318
	    fprintf(output, "  MULC\n");
	    for (int i = 0; i < 4; i++) {
	      fprintf(
		      output, "        %15.7lE%15.7lE%15.7lE%15.7lE\n",
		      cid->cmul[i][0], cid->cmul[i][1], cid->cmul[i][2], cid->cmul[i][3]
		      );
	    }
	    fprintf(output, "  MULCLR\n");
	    for (int i = 0; i < 4; i++) {
	      fprintf(
		      output, "        %15.7lE%15.7lE%15.7lE%15.7lE\n",
		      cid->cmullr[i][0], cid->cmullr[i][1], cid->cmullr[i][2], cid->cmullr[i][3]
		      );
	    }
	  }
	  // label 420, continues jphs loop
	  if (isam < 1) phs += sa->phsstp;
	} // jphs loop, labeled 480
	if (isam <= 1) thsl += sa->thsstp;
      } // jths loop, labeled 482
      ph += sa->phstp;
    } // jph484 loop
    th += sa->thstp;
  } // jth486 loop
  nvtxRangePop();
  interval_end = chrono::high_resolution_clock::now();
  elapsed = interval_end - interval_start;
  message = "INFO: angle loop for scale " + to_string(jxi488) + " took " + to_string(elapsed.count()) + "s.\n";
  logger->log(message);
  
  logger->log("INFO: finished scale iteration " + to_string(jxi488) + " of " + to_string(nxi) + ".\n");

  delete logger;

  return jer;
}
