#!/usr/bin/env python # coding: utf-8 # this script will interpolate RAQMS output in space and time to ozonesonde data. # you will need to link in or point to files as noted below. # # if comparing with just one sonde, the output is: # -- a csv file with sonde and model data for each level, as well as model error statistics # if comparing with more than one sonde, five plots and a file are output: # -- time-height and box-and-whisker plots of the sonde data # -- time-height plot of the model data # -- time-height and box-and-whisker plots of the interpolated model data # -- a csv file with period-averaged sonde and model data for each level, as well as model error statistics # *************************************************************************** import pandas as pd # version 2.2.2 import xarray as xr # version 2024.3.0 import numpy as np # version 1.26.4 import glob import os from matplotlib import pyplot as plt # version 3.8.4 import seaborn as sns # version 0.13.2 # libraries for interpolation # note: xesmf warnings while interpolating RAQMS ("Input array is not C_CONTIGUOUS") --can be ignored. import xesmf as xe # version 0.8.5 from scipy import interpolate # version 1.13.0 # date formatting on plots to look nice import matplotlib.dates as mdates import datetime import matplotlib.units as munits import matplotlib.ticker as mticker converter = mdates.ConciseDateConverter()#formats=['%Y', '%b', '%H', '%H', '%H', '%S.%f']) munits.registry[np.datetime64] = converter munits.registry[datetime.date] = converter munits.registry[datetime.datetime] = converter from monetio.models import raqms # version 0.2.6 with melodies-monet 0.1.dev1 import fv3raqms # provided with this script # *************************************************************************** # obs (l100 ozonesonde files) # link or point to only the time period you want (one or more sonde files for that location) sondepath = '/home/mharkey/Mod-Obs' sonde_site = 'bu' # two-letter descriptor for sonde site, will be used to select files and build output file names # RAQMS data # link or point to files within the date range below # specific date/time information: start_time = '2022-05-05 12:00:00' # time of first model timestep, should be one timestep behind sonde data end_time = '2022-05-05 18:00:00' # time of last model timestep, should be one timestep after sonde data modelpath = '/home/mharkey/Mod-Obs' date_desc = 'testone' # date info., will be used to build output file names # *********************************************************************************************************** # *********************************************************************************************************** # ***************************** # function party! def convert_l100(filename,savelocation): """Converts NOAA GSL l100 ozonesonde files to csv. Input: ------ - filename: path to l100 ozonesonde file - savelocation: directory in which to save csv """ import pandas as pd # grab filename information with .1l100 removed fnam_short = filename.split('/')[-1].split('.')[0] # l100 files are fixed width # reader uses column widths determined through playing around # reader also uses header length determined though playing around file = pd.read_fwf(filename,header=23,widths=[6,7,7,8,8,8,8,7,7,7,7,9,8,8]) file.to_csv('{}/{}.csv'.format(savelocation,fnam_short),index=False) return def extract_metadata(file): with open(file) as f: meta_dat = f.read() blocks = meta_dat.replace('\r','').split('\n\n') for block in blocks: if block.startswith('Station'): location_and_time_info = block.split('\n') convert_to_other_type = ['Station Height', 'Latitude','Longitude','Launch Date','Launch Time'] mdat = {} for i in location_and_time_info: if i.startswith('Launch Time'): mdat[i.split(':')[0].strip()] = i.split(':')[1:] else: mdat[i.split(':')[0].strip()] = i.split(':')[1].strip() mdat['Station Height'] = (np.float64(mdat['Station Height'].split()[0]),mdat['Station Height'].split()[1]) mdat['Latitude'] = np.float64(mdat['Latitude']) mdat['Longitude'] = np.float64(mdat['Longitude']) mdat['time'] = pd.to_datetime(mdat['Launch Date'] +' '+ mdat['Launch Time'][0]+':'+mdat['Launch Time'][1]) # extact units and names for columns. mdat['column_units'] = blocks[4].splitlines()[1].split() varnames = blocks[4].splitlines()[0].split()[:-7] # deal with last few names varnames.extend(['o3_nd','o3_res','o3_uncert']) varnames[7:10] =[varnames[i]+'_'+mdat['column_units'][i] for i in range(7,10)] mdat['column_names'] = varnames return mdat def read_single_sonde_file(filename): # extract time and location meta_data = extract_metadata(filename) # extract observations obs_data = pd.read_csv(filename,header=24,delimiter=r'\s+',names=meta_data['column_names']) # associate meta data and observations by combining into xarray obs_ds = obs_data.to_xarray() for i in range(len(meta_data['column_names'])): obs_ds[meta_data['column_names'][i]].attrs['units'] = meta_data['column_units'][i] obs_ds = obs_ds.swap_dims({'index':'Level'}).drop_vars('index') obs_ds['time'] = ('time',[meta_data['time']]) obs_ds['longitude'] = ('x',[meta_data['Longitude']]) obs_ds['latitude'] = ('x',[meta_data['Latitude']]) obs_ds.attrs['station'] = meta_data['Station'] return obs_ds.set_coords(['latitude','longitude','time']) def read_single_site(filelist): nf = len(filelist) for i in range(nf): single_obs = read_single_sonde_file(filelist[i]) if i ==0: all_dataset = single_obs else: all_dataset = xr.concat([all_dataset,single_obs],dim='time') # rename dimension "Level" to 'z' all_dataset = all_dataset.rename_dims({'Level':'z'}) # Apply filter to replace missing data with nan all_dataset = all_dataset.where(all_dataset['Ozone_ppmv'] != 99.999) # Replace nan's in Altitude variable with appropriate standard levels full_altitudes = np.full_like(all_dataset['Alt'],np.linspace(all_dataset['Alt'].min().round(1), all_dataset['Alt'].max().values,all_dataset.sizes['z'])) ## where keeps where the condition is true, replaces with full_altitudes where is is false. all_dataset['Alt'] = all_dataset['Alt'].where(~np.isnan(all_dataset['Alt']),full_altitudes) return all_dataset def pull_full(modelfilelist,sonde_dat,dates,varnames): ''' input ----- - modelfilelist: list of filenames - sonde_dat: xarray dataset - dates: array of datetimes for modelfilelist - varnames: list of variable names ''' import xarray as xr import numpy as np initialize_regrid = True nf = len(dates) # initialize a list to hold interpolated data temp_holder = [] for i in range(nf): if 'tracer' in modelfilelist[i]: raqms_opend = fv3raqms.open_dataset(modelfilelist[i]) elif 'uwhyb' in modelfilelist[i]: raqms_opend = raqms.open_dataset(modelfilelist[i]) if initialize_regrid: # only need to do this once. "nearest_s2d" function set up as a nearest neighbor interpolation rgd = xe.Regridder(raqms_opend[['latitude','longitude']],sonde_dat[['latitude','longitude']],\ 'nearest_s2d',locstream_out=True)#'bilinear',locstream_out=True) initialize_regrid = False temp_holder.append(rgd(raqms_opend[varnames])) # Turn list of interpolated model into an xarray and assign to output dataset model_at_sonde = xr.concat(temp_holder,'time') return model_at_sonde def vertical_interpolate(mod_var,mod_altitude,obs_altitude): from scipy import interpolate import numpy as np out_arr = np.full_like(obs_altitude,np.nan) if (obs_altitude.ndim >=2): for t in range(out_arr.shape[0]): z_mod = mod_altitude[t] z_obs = obs_altitude[t] var_mod = mod_var[t] f = interpolate.interp1d(z_mod,var_mod,fill_value=np.nan,bounds_error=False) out_arr[t] = f(z_obs) else: # to handle one sonde/time z_mod = mod_altitude z_obs = obs_altitude var_mod = mod_var f = interpolate.interp1d(z_mod,var_mod,fill_value=np.nan,bounds_error=False) out_arr = f(z_obs) return out_arr def rmse(res, expected): return np.sqrt(np.mean(((res*1e6) - expected)**2)) if (sonde_site == 'bu'): sonde_site_name = 'Boulder, CO' elif (sonde_site == 'th'): sonde_site_name = 'Trinidad Head, CA' elif (sonde_site == 'hh'): sonde_site_name = 'Hilo, Hawaii' elif (sonde_site == 'sa'): sonde_site_name = 'Pago Pago, American Samoa' # *********************************************************************************************************** # ***************************** # read and plot sonde data obsfiles = glob.glob(os.path.join(sondepath,sonde_site+'*_2022_*.l100')) obsfiles.sort() # find out how many sonde files there are--if there is only one, that will limit plot output. nsondes = len(obsfiles) # reset for naming output files: sonde_site = sonde_site.upper() # trimming out sondes that occur outside of the model time range I'm comparing with #obsfiles = obsfiles[2:] sonde_ds = read_single_site(obsfiles) #print(sonde_ds) sonde_df = sonde_ds.to_dataframe().reset_index() if (nsondes > 1): # plot observation curtain: dumz, mdtime = np.meshgrid(np.ones(sonde_ds.Level.size),sonde_ds.time) plt.pcolormesh(mdtime,sonde_ds['Alt'],sonde_ds['Ozone_ppmv'], vmin=0,vmax=10,cmap='nipy_spectral') plt.colorbar(label='ppmv') plt.title('ESRL/GML Ozonesonde: '+sonde_site_name) plt.ylabel('km') plt.ylim(0,40) plt.savefig(('sonde-'+sonde_site+'_'+date_desc+'-curtain.png'),format='png',dpi=300) plt.clf() # box-and-whisker plot: ax = sns.boxplot(x=sonde_df['Ozone_ppmv'],y=sonde_df['Alt'].round(),color='lightskyblue',linecolor='steelblue',linewidth=.75,orient='h') ax.invert_yaxis() # following https://stackoverflow.com/questions/20337664/cleanest-way-to-hide-every-nth-tick-label-in-matplotlib-colorbar for y axis labels n = 2 # Keeps every nth label [l.set_visible(False) for (i,l) in enumerate(ax.yaxis.get_ticklabels()) if i % n != 0] plt.title('ESRL/GML Ozonesonde: '+sonde_site_name) plt.ylabel('Altitude (km)') plt.xlabel('Ozone (ppmv)') plt.savefig(('sonde-'+sonde_site+'_'+date_desc+'.png'),format='png',dpi=300) plt.clf() # ***************************** # read, interpolote, and plot model data modfiles = glob.glob(os.path.join(modelpath,'uwh*nc')) modfiles.sort() #print(modfiles[0]) if 'tracer' in modfiles[0]: raqmsvars = ['o3vmr','zdash','pdash'] elif 'uwhyb' in modfiles[0]: raqmsvars = ['o3vmr','geop','pres_pa_mid'] # note: 'pdash' is in the output files but this is renamed in monetio/models/raqms.py # "pull_full" opens listed model files, gets nearest neighbor to sonde site, and returns selected variables on model timestep at model altitudes # note that you get "TypeError: object of type 'numpy.float64' has no len()" errors if dates here are wrong model = pull_full(modfiles,sonde_ds,pd.date_range(start_time,end_time,freq='6h'),raqmsvars) if (nsondes > 1): # plot model curtain before time+height interpolation: dumz, mdtime = np.meshgrid(np.ones(model.z.size),model.time) plt.pcolormesh(mdtime,model[raqmsvars[1]].squeeze()/1000.,model['o3vmr'].squeeze()*1e6, vmin=0,vmax=10,cmap='nipy_spectral') plt.colorbar(label='ppmv') plt.title('RAQMS Ozone: '+sonde_site_name) plt.ylabel('km') plt.ylim(0,60) plt.savefig(('RAQMS-'+sonde_site+'_'+date_desc+'-curtain.png'),format='png',dpi=300) plt.clf() # linear interpolation from model timestep to observation time step model_obst = model.interp(time=sonde_ds.time) # linear interpolation from model altitude to observation altitude # note: make sure the altitude variable for the interpolation has the same units for the observation and the sonde. # UFS-RAQMS is in meters while the sonde is in km. May also need to ensure both altitude sets are given relative to ground level or sea level. model_at_sonde_scipyint = vertical_interpolate(model_obst['o3vmr'].values.squeeze(), model_obst[raqmsvars[1]].values.squeeze()/1000.,sonde_ds['Alt'].values) # filter out model where there isn't a valid observation model_at_sonde_scipyint[np.where(np.isnan(sonde_ds['Ozone_ppmv'].values))] = np.nan sonde_df['mod_interp'] = pd.Series(data=model_at_sonde_scipyint.reshape(-1).squeeze()) if (nsondes > 1): # plot model curtain after interpolation: dumz, mdtime = np.meshgrid(np.ones(sonde_ds.z.size),sonde_ds.time) plt.pcolormesh(mdtime,sonde_ds['Alt'].squeeze(),model_at_sonde_scipyint*1e6, vmin=0,vmax=10,cmap='nipy_spectral') plt.colorbar(label='ppmv') plt.title('RAQMS Ozone: Boulder, CO') plt.ylabel('km') plt.ylim(0,40) plt.savefig(('interpRAQMS-'+sonde_site+'_'+date_desc+'-curtain.png'),format='png',dpi=300) plt.clf() # box-and-whisker plot: ax = sns.boxplot(x=sonde_df['mod_interp']*1e6,y=sonde_df['Alt'].round(),color='peachpuff',linecolor='coral',linewidth=.75,orient='h') ax.invert_yaxis() n = 2 # Keeps every nth label [l.set_visible(False) for (i,l) in enumerate(ax.yaxis.get_ticklabels()) if i % n != 0] plt.title('Boulder, CO') plt.ylabel('Altitude (km)') plt.xlabel('Ozone (ppmv)') plt.savefig(('interpRAQMS-'+sonde_site+'_'+date_desc+'.png'),format='png',dpi=300) plt.clf() # ***************************** # add some stats calculations: # mean error (MEz = mean absolute error), mean bias (MBz), and RMSEz as a function of altitude # this needs to be updated for one sonde MEz = np.empty(sonde_ds.Level.size) MEz[:] = np.nan MBz = np.empty(sonde_ds.Level.size) MBz[:] = np.nan RMSEz = np.empty(sonde_ds.Level.size) RMSEz[:] = np.nan if (nsondes > 1): for l in range(0,sonde_ds.Level.size): MEz[l] = np.sum(np.abs((model_at_sonde_scipyint[:,l]*1e6)-sonde_ds['Ozone_ppmv'][:,l])) / sonde_ds.time.size MBz[l] = np.sum((model_at_sonde_scipyint[:,l]*1e6)-sonde_ds['Ozone_ppmv'][:,l]) / sonde_ds.time.size RMSEz[l] = rmse(model_at_sonde_scipyint[:,l], sonde_ds['Ozone_ppmv'][:,l]) # plot MEz, MBz, and RMSEz: plt.title(sonde_site+', '+date_desc) plt.xlabel('ppmv') plt.ylabel('Altitude (km)') plt.plot(MEz, sonde_ds['Alt'][0,:].round(), color='red', label='Mean Absolute Error', linestyle='--', linewidth=1.5) plt.plot(MBz, sonde_ds['Alt'][0,:].round(), color='black', label='Mean Bias') plt.plot(RMSEz, sonde_ds['Alt'][0,:].round(), color='blue', label='RMSE') plt.savefig(('interpRAQMS-'+sonde_site+'_'+date_desc+'_stats.png'), format='png', dpi=300) stats_df = pd.DataFrame({'Sonde (ppmv)':sonde_ds['Ozone_ppmv'].mean(dim='time').values, 'RAQMS (ppmv)': (sonde_df.groupby('z')['mod_interp'].mean().values*1e6), 'Altitude (km)':sonde_ds['Alt'][0,:], 'Mean Error':MEz, 'Mean Bias':MBz, 'RMSE':RMSEz}) else: for l in range(0,sonde_ds.Level.size): # IndexError: too many indices for array: array is 1-dimensional, but 2 were indexed MEz[l] = np.sum(np.abs((model_at_sonde_scipyint[l]*1e6)-sonde_ds['Ozone_ppmv'][l])) # (divide by 1 time) MBz[l] = np.sum((model_at_sonde_scipyint[l]*1e6)-sonde_ds['Ozone_ppmv'][l]) # (divide by 1 time) RMSEz[l] = rmse(model_at_sonde_scipyint[l], sonde_ds['Ozone_ppmv'][l]) stats_df = pd.DataFrame({'Sonde (ppmv)':sonde_ds['Ozone_ppmv'].values, 'RAQMS (ppmv)': (sonde_df['mod_interp'].values*1e6), 'Altitude (km)':sonde_ds['Alt'][:], 'Mean Error':MEz, 'Mean Bias':MBz, 'RMSE':RMSEz}) # output stats to csv: stats_df.to_csv('interpRAQMS-'+sonde_site+'_'+date_desc+'_stats.csv')