# default.config

########################################################################################################################
# This is the configuration file for the MOS-X model. Parameters specified here are used to build and run the
# machine learning model to predict a few weather parameters at a given station location.
########################################################################################################################
# Global parameters go here.

# The root directory of this program (empty in this default file)
MOSX_ROOT =

# The root directory of the BUFRgruven executable program (empty in this default file)
BUFR_ROOT =

# The directory where site-specific data files are saved. If left empty, it defaults to the 'site_data' directory
# under MOSX_ROOT (%MOSX_ROOT/site_data).
SITE_ROOT =

# The 4-letter station ID of the location
station_id = KAUS

# Lowest pressure level for vertical profiles. For stations at high elevation, this level may have to be moved higher
# in altitude (that is, set to a smaller pressure value), because data will be missing at higher pressures.
# NOTE(review): the units are not stated here; presumably hPa -- confirm.
lowest_p_level = 950

# Starting and ending dates for the model training, provided as YYYYMMDD. BUFR data are available beginning
# 2010-01-01, so 'data_start_date' should not be earlier than that date. If the parameter 'is_season' is True, then
# the program will assume that you want seasonally-subset training data beginning in the year of 'data_start_date' and
# ending in the year of 'data_end_date', with seasons spanning from the start day to the end day (here, November 20
# through April 9 of the following year).
data_start_date = 20101120
data_end_date = 20160409
is_season = True

# The UTC hour at which the forecast period starts on any given day. The period ends 24 hours later. (Defaults to 6.)
forecast_hour_start = 6

# The hourly resolution of the time series prediction. Should be one of 1, 2, 3, 4, or 6 (an integer in the range of
# 1 to 6, but not 5). Ignored if the parameter 'predict_timeseries' under 'Model' is False. (Defaults to 3.)
time_series_interval = 3

# API token for MesoWest data (required; empty in this default file). To get an API key, visit
# https://synopticlabs.org/api/
# NOTE(review): the token is a credential; avoid committing a filled-in copy of this file to version control.
meso_token =

# Produce verbose output
verbose = True


########################################################################################################################
# In this section, provide parameters used to control the retrieval of BUFR profile model data.

[BUFR]

    # (Optional) The station ID used in BUFR for the station. Defaults to station_id, but the BUFR ID may be different.
    bufr_station_id =

    # (Optional) Data directory where BUFKIT files are saved (defaults to %SITE_ROOT/bufkit)
    bufr_data_dir =

    # (Optional) Path to the bufrgruven executable (defaults to %BUFR_ROOT/bufr_gruven.pl)
    bufrgruven =

    # BUFR models to use; should be a list, written as comma-separated names as in the value below
    models = GFS, NAM


########################################################################################################################
# In this section, provide parameters to control the retrieval of observations.

[Obs]

    # Option to use upper-air sounding data (True or False)
    use_soundings = False

    # Upper-air sounding station ID. Required if use_soundings is True.
    sounding_station_id = FWD

    # (Optional) Data directory where sounding files are saved (defaults to %SITE_ROOT/soundings)
    sounding_data_dir =

    # Whether to retrieve NCDC and CF6 data for max wind values. Set this option to False to disable the retrieval;
    # this may be necessary if the forecast start hour is not near 6 UTC. Otherwise it should be True. (Defaults to
    # True.)
    use_climo_wind = True


########################################################################################################################
# In this section, provide the machine learning model parameters.

[Model]

    # Save the model estimator using pickle to this file.
    # NOTE(review): the %(SITE_ROOT)s and %(station_id)s references are presumably interpolated from the global
    # parameters of the same names -- confirm against the config parser.
    estimator_file = %(SITE_ROOT)s/%(station_id)s_mosx.pkl

    # The base scikit-learn regressor, given as a string naming the algorithm
    regressor = ensemble.RandomForestRegressor

    # If True, trains a separate estimator for each weather parameter
    train_individual = True

    # If True, also predict a meteorological time series for the next day, in addition to high/low/wind/rain
    predict_timeseries = False

    # The type of rain forecast. One of:
    #   'quantity'    predicts an actual amount of rain in inches
    #   'categorical' predicts a probabilistic category of rain (a la MOS)
    #   'pop'         (probability of precipitation) predicts the fractional chance that there will be ANY measurable
    #                 precipitation
    rain_forecast_type = pop

    # Keyword arguments passed to the base regressor
    [[Parameters]]
        n_estimators = 1000
        max_features = 0.75
        n_jobs = 2
        verbose = 1

    # This section, if present (uncomment it to enable), turns on a post-processing algorithm that is trained on the
    # raw rain predictions from a native ensemble regressor. This is usually desirable because rain has a very
    # non-normal distribution and is therefore tricky for standard algorithms to predict. The parameter
    # 'rain_estimator' is a string, just like 'regressor' above, which determines the scikit-learn algorithm to use
    # (classifiers are an option!). The parameter 'use_raw_rain' determines whether the raw rainfall estimates from the
    # BUFR models are also used as features for the rain post-processor, in addition to the model ensemble statistics.
    # Any other parameters provided here are passed as kwargs to the initialization of the processor's scikit-learn
    # algorithm. Because of the methods used for bootstrapping, 'use_raw_rain' is currently not compatible with the
    # 'Bootstrapping' section.
    # [[Rain tuning]]
    #     rain_estimator = ensemble.RandomForestRegressor
    #     use_raw_rain = False
    #     n_estimators = 100

    # Ada boosting may improve the performance of a model by selectively increasing the weight of training on samples
    # with a large training error. If Ada boosting is desired, uncomment this subsection and provide here any
    # parameters passed to the Ada class; otherwise, leave it removed or commented out.
    # [[Ada boosting]]
    #     n_estimators = 50

    # This last option allows for bootstrapping development of an ensemble of ML models. The training set is split
    # according to a few options here, then an ensemble of n_members is generated by training on individual splits.
    # Leave this section commented out to disable bootstrapping.
    # [[Bootstrapping]]
    #     n_members = 10
    #     # The number of training samples per split (if int), or the fraction (if float)
    #     n_samples_split = 0.1
    #     # If 1, each split contains no sample present in any other split. Also overrides n_samples_split and sets it
    #     # as the maximum available per split. Otherwise, set to 0.
    #     unique_splits = 0


########################################################################################################################
# There are a few parameters here for validation of the model.

[Validate]

    # Start and end dates for the validation, written as YYYYMMDD as in the values below
    start_date = 20161120
    end_date = 20170409


########################################################################################################################
# The run executable allows for an upload to an FTP or SFTP server, for example, to post data to a website. The forecast
# data are aggregated over all runs and uploaded in the format given by 'file_type' (see the options listed for that
# parameter below), and if a plot of forecast ensemble distributions is requested, that plot is also uploaded.

[Upload]

    # Type of file to upload forecasts in. Can be one of the following, or a list of several:
    #   'pickle'  one file for all forecasts
    #   'json'    one file per forecast date
    #   'uw_text' one file per forecast date, only a short version with high/pop
    file_type = json

    # User name, server, and directories on the server. The program prompts for a password unless an ssh key is set
    # up. The forecast_directory points to where the forecasts are uploaded, while the plot_directory points to where
    # plots are uploaded. If 'username' and 'server' are both empty, the directories are assumed to be local.
    username =
    server =
    forecast_directory =
    plot_directory =