#!/usr/bin/env python
# coding: utf-8

# this script will interpolate RAQMS output in space and time to ozonesonde data.
# you will need to link in or point to files as noted below.
#
# if comparing with just one sonde, the output is:
# -- a csv file with sonde and model data for each level, as well as model error statistics
# if comparing with more than one sonde, five plots and a file are output:
# -- time-height and box-and-whisker plots of the sonde data
# -- time-height plot of the model data
# -- time-height and box-and-whisker plots of the interpolated model data
# -- a csv file with period-averaged sonde and model data for each level, as well as model error statistics

# ***************************************************************************
import pandas as pd                             # version 2.2.2 
import xarray as xr                             # version 2024.3.0
import numpy as np                              # version 1.26.4 
import glob
import os
from matplotlib import pyplot as plt            # version 3.8.4
import seaborn as sns                           # version 0.13.2 

# libraries for interpolation
# note: xesmf warnings while interpolating RAQMS ("Input array is not C_CONTIGUOUS") --can be ignored.
import xesmf as xe                              # version 0.8.5
from scipy import interpolate                   # version 1.13.0 

# date formatting on plots to look nice
import matplotlib.dates as mdates
import datetime
import matplotlib.units as munits
import matplotlib.ticker as mticker
converter = mdates.ConciseDateConverter()#formats=['%Y', '%b', '%H', '%H', '%H', '%S.%f'])
munits.registry[np.datetime64] = converter
munits.registry[datetime.date] = converter
munits.registry[datetime.datetime] = converter

from monetio.models import raqms                # version 0.2.6 with melodies-monet 0.1.dev1
import fv3raqms                                 # provided with this script

# ***************************************************************************
# obs (l100 ozonesonde files) 
# link or point to only the time period you want (one or more sonde files for that location)
sondepath = '/home/mharkey/Mod-Obs' 
sonde_site = 'bu' # two-letter descriptor for sonde site, will be used to select files and build output file names

# RAQMS data
# link or point to files within the date range below
# specific date/time information:
start_time = '2022-05-05 12:00:00' # time of first model timestep, should be one timestep behind sonde data
end_time = '2022-05-05 18:00:00' # time of last model timestep, should be one timestep after sonde data
modelpath = '/home/mharkey/Mod-Obs' 

date_desc = 'testone' # date info., will be used to build output file names

# ***********************************************************************************************************
# ***********************************************************************************************************
# *****************************
# function party!

def convert_l100(filename,savelocation):
    """Converts NOAA GSL l100 ozonesonde files to csv.
        Input:
        ------
        - filename: path to l100 ozonesonde file
        - savelocation: directory in which to save csv
    """
    import pandas as pd
    # grab filename information with .1l100 removed 
    fnam_short = filename.split('/')[-1].split('.')[0]
    # l100 files are fixed width
    # reader uses column widths determined through playing around
    # reader also uses header length determined though playing around
    file = pd.read_fwf(filename,header=23,widths=[6,7,7,8,8,8,8,7,7,7,7,9,8,8])
    file.to_csv('{}/{}.csv'.format(savelocation,fnam_short),index=False)
    return

def extract_metadata(file):
    with open(file) as f:
        meta_dat = f.read()
    blocks = meta_dat.replace('\r','').split('\n\n')    
    for block in blocks:
        if block.startswith('Station'):
            location_and_time_info = block.split('\n')
    convert_to_other_type = ['Station Height', 'Latitude','Longitude','Launch Date','Launch Time']
    mdat = {}
    for i in location_and_time_info:
        if i.startswith('Launch Time'):
            mdat[i.split(':')[0].strip()] = i.split(':')[1:]
        else:
            mdat[i.split(':')[0].strip()] = i.split(':')[1].strip()
    mdat['Station Height'] = (np.float64(mdat['Station Height'].split()[0]),mdat['Station Height'].split()[1])
    mdat['Latitude'] = np.float64(mdat['Latitude'])
    mdat['Longitude'] = np.float64(mdat['Longitude'])
    mdat['time'] = pd.to_datetime(mdat['Launch Date'] +' '+ mdat['Launch Time'][0]+':'+mdat['Launch Time'][1])
    # extact units and names for columns. 
    mdat['column_units'] = blocks[4].splitlines()[1].split()
    varnames = blocks[4].splitlines()[0].split()[:-7] # deal with last few names
    varnames.extend(['o3_nd','o3_res','o3_uncert']) 
    varnames[7:10] =[varnames[i]+'_'+mdat['column_units'][i] for i in range(7,10)]
    mdat['column_names'] = varnames
    return mdat

def read_single_sonde_file(filename):
    # extract time and location
    meta_data = extract_metadata(filename)
    # extract observations
    obs_data = pd.read_csv(filename,header=24,delimiter=r'\s+',names=meta_data['column_names'])
    # associate meta data and observations by combining into xarray
    obs_ds = obs_data.to_xarray()
    for i in range(len(meta_data['column_names'])):
        obs_ds[meta_data['column_names'][i]].attrs['units'] = meta_data['column_units'][i]
    obs_ds = obs_ds.swap_dims({'index':'Level'}).drop_vars('index')
    obs_ds['time'] = ('time',[meta_data['time']])
    obs_ds['longitude'] = ('x',[meta_data['Longitude']])
    obs_ds['latitude'] = ('x',[meta_data['Latitude']])
    obs_ds.attrs['station'] = meta_data['Station']
    return obs_ds.set_coords(['latitude','longitude','time'])

def read_single_site(filelist):
    nf = len(filelist)
    for i in range(nf):
        single_obs = read_single_sonde_file(filelist[i])
        if i ==0: 
            all_dataset = single_obs
        else:
            all_dataset = xr.concat([all_dataset,single_obs],dim='time')
    # rename dimension "Level" to 'z'
    all_dataset = all_dataset.rename_dims({'Level':'z'})
    # Apply filter to replace missing data with nan
    all_dataset = all_dataset.where(all_dataset['Ozone_ppmv'] != 99.999)
    # Replace nan's in Altitude variable with appropriate standard levels
    full_altitudes = np.full_like(all_dataset['Alt'],np.linspace(all_dataset['Alt'].min().round(1),
                                                                 all_dataset['Alt'].max().values,all_dataset.sizes['z']))
    ## where keeps where the condition is true, replaces with full_altitudes where is is false.
    all_dataset['Alt'] = all_dataset['Alt'].where(~np.isnan(all_dataset['Alt']),full_altitudes)
    return all_dataset

def pull_full(modelfilelist,sonde_dat,dates,varnames):
    '''
    input
    -----
    - modelfilelist: list of filenames
    - sonde_dat: xarray dataset
    - dates: array of datetimes for modelfilelist
    - varnames: list of variable names 
    '''
    import xarray as xr
    import numpy as np
    initialize_regrid = True
    nf = len(dates)
    # initialize a list to hold interpolated data
    temp_holder = []
    for i in range(nf):
        if 'tracer' in modelfilelist[i]:
            raqms_opend = fv3raqms.open_dataset(modelfilelist[i])
        elif 'uwhyb' in modelfilelist[i]:
            raqms_opend = raqms.open_dataset(modelfilelist[i])
        if initialize_regrid: # only need to do this once. "nearest_s2d" function set up as a nearest neighbor interpolation 
            rgd = xe.Regridder(raqms_opend[['latitude','longitude']],sonde_dat[['latitude','longitude']],\
                              'nearest_s2d',locstream_out=True)#'bilinear',locstream_out=True)
            initialize_regrid = False
        temp_holder.append(rgd(raqms_opend[varnames]))
    # Turn list of interpolated model into an xarray and assign to output dataset 
    model_at_sonde = xr.concat(temp_holder,'time')
    return model_at_sonde

def vertical_interpolate(mod_var,mod_altitude,obs_altitude):
    from scipy import interpolate
    import numpy as np
    out_arr = np.full_like(obs_altitude,np.nan)
    if (obs_altitude.ndim >=2):
       for t in range(out_arr.shape[0]):
        z_mod = mod_altitude[t]
        z_obs = obs_altitude[t]
        var_mod = mod_var[t]
        f = interpolate.interp1d(z_mod,var_mod,fill_value=np.nan,bounds_error=False)
        out_arr[t] = f(z_obs)
    else: # to handle one sonde/time
        z_mod = mod_altitude
        z_obs = obs_altitude
        var_mod = mod_var
        f = interpolate.interp1d(z_mod,var_mod,fill_value=np.nan,bounds_error=False)
        out_arr = f(z_obs)
    return out_arr

def rmse(res, expected):
    return np.sqrt(np.mean(((res*1e6) - expected)**2))

if (sonde_site == 'bu'):
    sonde_site_name = 'Boulder, CO'
elif (sonde_site == 'th'):
    sonde_site_name = 'Trinidad Head, CA'
elif (sonde_site == 'hh'):
    sonde_site_name = 'Hilo, Hawaii'
elif (sonde_site == 'sa'):
    sonde_site_name = 'Pago Pago, American Samoa'

# ***********************************************************************************************************
# *****************************
# read and plot sonde data

obsfiles = glob.glob(os.path.join(sondepath,sonde_site+'*_2022_*.l100'))
obsfiles.sort()
# find out how many sonde files there are--if there is only one, that will limit plot output.
nsondes = len(obsfiles)
# reset for naming output files:
sonde_site = sonde_site.upper()
# trimming out sondes that occur outside of the model time range I'm comparing with
#obsfiles = obsfiles[2:]
sonde_ds = read_single_site(obsfiles)
#print(sonde_ds)
sonde_df = sonde_ds.to_dataframe().reset_index()

if (nsondes > 1):
   # plot observation curtain:
   dumz, mdtime = np.meshgrid(np.ones(sonde_ds.Level.size),sonde_ds.time)
   plt.pcolormesh(mdtime,sonde_ds['Alt'],sonde_ds['Ozone_ppmv'],
                  vmin=0,vmax=10,cmap='nipy_spectral')
   plt.colorbar(label='ppmv')
   plt.title('ESRL/GML Ozonesonde: '+sonde_site_name)
   plt.ylabel('km')
   plt.ylim(0,40)
   plt.savefig(('sonde-'+sonde_site+'_'+date_desc+'-curtain.png'),format='png',dpi=300)
   plt.clf()

   # box-and-whisker plot:
   ax = sns.boxplot(x=sonde_df['Ozone_ppmv'],y=sonde_df['Alt'].round(),color='lightskyblue',linecolor='steelblue',linewidth=.75,orient='h')
   ax.invert_yaxis()
   # following https://stackoverflow.com/questions/20337664/cleanest-way-to-hide-every-nth-tick-label-in-matplotlib-colorbar for y axis labels
   n = 2  # Keeps every nth label
   [l.set_visible(False) for (i,l) in enumerate(ax.yaxis.get_ticklabels()) if i % n != 0]
   plt.title('ESRL/GML Ozonesonde: '+sonde_site_name)
   plt.ylabel('Altitude (km)')
   plt.xlabel('Ozone (ppmv)')
   plt.savefig(('sonde-'+sonde_site+'_'+date_desc+'.png'),format='png',dpi=300)
   plt.clf()


# *****************************
# read, interpolote, and plot model data

modfiles = glob.glob(os.path.join(modelpath,'uwh*nc'))
modfiles.sort()
#print(modfiles[0])
if 'tracer' in modfiles[0]:
    raqmsvars = ['o3vmr','zdash','pdash']
elif 'uwhyb' in modfiles[0]:
    raqmsvars = ['o3vmr','geop','pres_pa_mid'] # note: 'pdash' is in the output files but this is renamed in monetio/models/raqms.py
# "pull_full" opens listed model files, gets nearest neighbor to sonde site, and returns selected variables on model timestep at model altitudes
# note that you get "TypeError: object of type 'numpy.float64' has no len()" errors if dates here are wrong
model = pull_full(modfiles,sonde_ds,pd.date_range(start_time,end_time,freq='6h'),raqmsvars)

if (nsondes > 1):
   # plot model curtain before time+height interpolation:
   dumz, mdtime = np.meshgrid(np.ones(model.z.size),model.time)
   plt.pcolormesh(mdtime,model[raqmsvars[1]].squeeze()/1000.,model['o3vmr'].squeeze()*1e6,
                  vmin=0,vmax=10,cmap='nipy_spectral')
   plt.colorbar(label='ppmv')
   plt.title('RAQMS Ozone: '+sonde_site_name)
   plt.ylabel('km')
   plt.ylim(0,60)
   plt.savefig(('RAQMS-'+sonde_site+'_'+date_desc+'-curtain.png'),format='png',dpi=300)
   plt.clf()

# linear interpolation from model timestep to observation time step
model_obst = model.interp(time=sonde_ds.time)
# linear interpolation from model altitude to observation altitude
# note: make sure the altitude variable for the interpolation has the same units for the observation and the sonde. 
# UFS-RAQMS is in meters while the sonde is in km. May also need to ensure both altitude sets are given relative to ground level or sea level. 
model_at_sonde_scipyint = vertical_interpolate(model_obst['o3vmr'].values.squeeze(),
                                               model_obst[raqmsvars[1]].values.squeeze()/1000.,sonde_ds['Alt'].values)
# filter out model where there isn't a valid observation
model_at_sonde_scipyint[np.where(np.isnan(sonde_ds['Ozone_ppmv'].values))] = np.nan
sonde_df['mod_interp'] = pd.Series(data=model_at_sonde_scipyint.reshape(-1).squeeze())

if (nsondes > 1):
   # plot model curtain after interpolation:
   dumz, mdtime = np.meshgrid(np.ones(sonde_ds.z.size),sonde_ds.time)
   plt.pcolormesh(mdtime,sonde_ds['Alt'].squeeze(),model_at_sonde_scipyint*1e6,
                  vmin=0,vmax=10,cmap='nipy_spectral')
   plt.colorbar(label='ppmv')
   plt.title('RAQMS Ozone: Boulder, CO')
   plt.ylabel('km')
   plt.ylim(0,40)
   plt.savefig(('interpRAQMS-'+sonde_site+'_'+date_desc+'-curtain.png'),format='png',dpi=300)
   plt.clf()

   # box-and-whisker plot:
   ax = sns.boxplot(x=sonde_df['mod_interp']*1e6,y=sonde_df['Alt'].round(),color='peachpuff',linecolor='coral',linewidth=.75,orient='h')
   ax.invert_yaxis()
   n = 2  # Keeps every nth label
   [l.set_visible(False) for (i,l) in enumerate(ax.yaxis.get_ticklabels()) if i % n != 0]
   plt.title('Boulder, CO')
   plt.ylabel('Altitude (km)')
   plt.xlabel('Ozone (ppmv)')
   plt.savefig(('interpRAQMS-'+sonde_site+'_'+date_desc+'.png'),format='png',dpi=300)
   plt.clf()


# *****************************
# add some stats calculations: 
# mean error (MEz = mean absolute error), mean bias (MBz), and RMSEz as a function of altitude 

# this needs to be updated for one sonde
MEz = np.empty(sonde_ds.Level.size)
MEz[:] = np.nan
MBz = np.empty(sonde_ds.Level.size)
MBz[:] = np.nan
RMSEz = np.empty(sonde_ds.Level.size)
RMSEz[:] = np.nan

if (nsondes > 1):
   for l in range(0,sonde_ds.Level.size):
       MEz[l] = np.sum(np.abs((model_at_sonde_scipyint[:,l]*1e6)-sonde_ds['Ozone_ppmv'][:,l])) / sonde_ds.time.size
       MBz[l] = np.sum((model_at_sonde_scipyint[:,l]*1e6)-sonde_ds['Ozone_ppmv'][:,l]) / sonde_ds.time.size
       RMSEz[l] = rmse(model_at_sonde_scipyint[:,l], sonde_ds['Ozone_ppmv'][:,l])
    
   # plot MEz, MBz, and RMSEz:
   plt.title(sonde_site+', '+date_desc) 
   plt.xlabel('ppmv') 
   plt.ylabel('Altitude (km)') 
   plt.plot(MEz, sonde_ds['Alt'][0,:].round(), color='red', label='Mean Absolute Error', linestyle='--', linewidth=1.5)
   plt.plot(MBz, sonde_ds['Alt'][0,:].round(), color='black', label='Mean Bias')
   plt.plot(RMSEz, sonde_ds['Alt'][0,:].round(), color='blue', label='RMSE') 
   plt.savefig(('interpRAQMS-'+sonde_site+'_'+date_desc+'_stats.png'), format='png', dpi=300)

   stats_df = pd.DataFrame({'Sonde (ppmv)':sonde_ds['Ozone_ppmv'].mean(dim='time').values, 'RAQMS (ppmv)': (sonde_df.groupby('z')['mod_interp'].mean().values*1e6), 'Altitude (km)':sonde_ds['Alt'][0,:], 'Mean Error':MEz, 'Mean Bias':MBz, 'RMSE':RMSEz})
else:
   for l in range(0,sonde_ds.Level.size):
# IndexError: too many indices for array: array is 1-dimensional, but 2 were indexed
      MEz[l] = np.sum(np.abs((model_at_sonde_scipyint[l]*1e6)-sonde_ds['Ozone_ppmv'][l])) # (divide by 1 time)
      MBz[l] = np.sum((model_at_sonde_scipyint[l]*1e6)-sonde_ds['Ozone_ppmv'][l]) # (divide by 1 time)
      RMSEz[l] = rmse(model_at_sonde_scipyint[l], sonde_ds['Ozone_ppmv'][l])

   stats_df = pd.DataFrame({'Sonde (ppmv)':sonde_ds['Ozone_ppmv'].values, 'RAQMS (ppmv)': (sonde_df['mod_interp'].values*1e6), 'Altitude (km)':sonde_ds['Alt'][:], 'Mean Error':MEz, 'Mean Bias':MBz, 'RMSE':RMSEz})

# output stats to csv:
stats_df.to_csv('interpRAQMS-'+sonde_site+'_'+date_desc+'_stats.csv')