pvlib-python/pvlib/iotools/srml.py at 42e767baf2ca3fecf70501e723d6cec8223e29c0 · AdamRJensen/pvlib-python · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
"""Collection of functions to operate on data from University of Oregon Solar
Radiation Monitoring Laboratory (SRML) data.
"""
import numpy as np
import pandas as pd
import urllib
import warnings

# VARIABLE_MAP is a dictionary mapping SRML data element numbers to their
# pvlib names. For most variables, only the first three digits are used,
# the fourth indicating the instrument. Spectral data (7xxx) uses all
# four digits to indicate the variable. See a full list of data element
# numbers `here. <http://solardata.uoregon.edu/DataElementNumbers.html>`_

VARIABLE_MAP = {
    '100': 'ghi',
    '201': 'dni',
    '300': 'dhi',
    '920': 'wind_direction',
    '921': 'wind_speed',
    '930': 'temp_air',
    '931': 'temp_dew',
    '933': 'relative_humidity',
    '937': 'temp_cell',
}


def read_srml(filename, map_variables=True):
    """
    Read University of Oregon SRML 1min .tsv file into pandas dataframe.

    The SRML is described in [1]_.

    Parameters
    ----------
    filename: str
        filepath or url to read for the tsv file.
    map_variables: bool, default: True
        When true, renames columns of the DataFrame to pvlib variable names
        where applicable. See variable :const:`VARIABLE_MAP`.

    Returns
    -------
    data: Dataframe
        A dataframe with datetime index

    Notes
    -----
    The time index is shifted back by one interval to account for the
    daily endtime of 2400, and to avoid time parsing errors on leap
    years. The returned data values are labeled by the left endpoint of
    interval, and should be understood to occur during the interval from
    the time of the row until the time of the next row. This is consistent
    with pandas' default labeling behavior.

    See [2]_ for more information concerning the file format.

    References
    ----------
    .. [1] University of Oregon Solar Radiation Monitoring Laboratory
       http://solardata.uoregon.edu/
    .. [2] `Archival (short interval) data files
       <http://solardata.uoregon.edu/ArchivalFiles.html>`_
    """
    tsv_data = pd.read_csv(filename, delimiter='\t')
    data = _format_index(tsv_data)
    # Drop day of year and time columns
    data = data[data.columns[2:]]

    if map_variables:
        data = data.rename(columns=_map_columns)

    # Quality flag columns are all labeled 0 in the original data. They
    # appear immediately after their associated variable and are suffixed
    # with an integer value when read from the file. So we map flags to
    # the preceding variable with a '_flag' suffix.
    #
    # Example:
    #   Columns ['ghi_0', '0.1', 'temp_air_2', '0.2']
    #
    #   Yields a flag_label_map of:
    #       { '0.1': 'ghi_0_flag',
    #         '0.2': 'temp_air_2'}
    #
    columns = data.columns
    flag_label_map = {flag: columns[columns.get_loc(flag) - 1] + '_flag'
                      for flag in columns[1::2]}
    data = data.rename(columns=flag_label_map)

    # Mask data marked with quality flag 99 (bad or missing data)
    for col in columns[::2]:
        missing = data[col + '_flag'] == 99
        data[col] = data[col].where(~(missing), np.nan)
    return data


def _map_columns(col):
    """Map data element numbers to pvlib names.

    Parameters
    ----------
    col: str
        Column label to be mapped.

    Returns
    -------
    str
        The pvlib label if it was found in the mapping,
        else the original label.
    """
    if col.startswith('7'):
        # spectral data
        try:
            return VARIABLE_MAP[col]
        except KeyError:
            return col
    try:
        variable_name = VARIABLE_MAP[col[:3]]
        variable_number = col[3:]
        return variable_name + '_' + variable_number
    except KeyError:
        return col


def _format_index(df):
    """Create a datetime index from day of year, and time columns.

    Parameters
    ----------
    df: pd.Dataframe
        The srml data to reindex.

    Returns
    -------
    df: pd.Dataframe
        The Dataframe with a DatetimeIndex localized to 'Etc/GMT+8'.
    """
    # Name of the second column indicates the year of the file, but
    # the column contains times.
    year = int(df.columns[1])
    df_doy = df[df.columns[0]]
    # Times are expressed as integers from 1-2400, we convert to 0-2359 by
    # subracting the length of one interval and then correcting the times
    # at each former hour. interval_length is determined by taking the
    # difference of the first two rows of the time column.
    # e.g. The first two rows of hourly data are 100 and 200
    #      so interval_length is 100.
    interval_length = df[df.columns[1]][1] - df[df.columns[1]][0]
    df_time = df[df.columns[1]] - interval_length
    if interval_length == 100:
        # Hourly files do not require fixing the former hour timestamps.
        times = df_time
    else:
        # Because hours are represented by some multiple of 100, shifting
        # results in invalid values.
        #
        # e.g. 200 (for 02:00) shifted by 15 minutes becomes 185, the
        #      desired result is 145 (for 01:45)
        #
        # So we find all times with minutes greater than 60 and remove 40
        # to correct to valid times.
        old_hours = df_time % 100 > 60
        times = df_time.where(~old_hours, df_time - 40)
    times = times.apply(lambda x: '{:04.0f}'.format(x))
    doy = df_doy.apply(lambda x: '{:03.0f}'.format(x))
    dts = pd.to_datetime(str(year) + '-' + doy + '-' + times,
                         format='%Y-%j-%H%M')
    df.index = dts
    df = df.tz_localize('Etc/GMT+8')
    return df


def get_srml(station, start, end, filetype='PO', map_variables=True,
             url="http://solardata.uoregon.edu/download/Archive/"):
    """Request data from UoO SRML and read it into a Dataframe.

    The University of Oregon Solar Radiation Monitoring Laboratory (SRML) is
    described in [1]_. A list of stations can be found in [2]_.

    Data is returned for the entire months between and including start and end.

    Parameters
    ----------
    station : str
        Two letter station abbreviation.
    start : datetime-like
        First day of the requested period
    end : datetime-like
        Last day of the requested period
    filetype : string, default: 'PO'
        SRML file type to gather. See notes for explanation.
    map_variables : bool, default: True
        When true, renames columns of the DataFrame to pvlib variable names
        where applicable. See variable :const:`VARIABLE_MAP`.
    url : str, default: 'http://solardata.uoregon.edu/download/Archive/'
        API endpoint URL

    Returns
    -------
    data : pd.DataFrame
        Dataframe with data from SRML.
    meta : dict
        Metadata.

    Notes
    -----
    File types designate the time interval of a file and if it contains
    raw or processed data. For instance, `RO` designates raw, one minute
    data and `PO` designates processed one minute data. The availability
    of file types varies between sites. Below is a table of file types
    and their time intervals. See [1] for site information.

    ============= ============ ==================
    time interval raw filetype processed filetype
    ============= ============ ==================
    1 minute      RO           PO
    5 minute      RF           PF
    15 minute     RQ           PQ
    hourly        RH           PH
    ============= ============ ==================

    Warning
    -------
    SRML data has nighttime data prefilled with 0s through the end of the
    current month (i.e., values are provided for data in the future).

    References
    ----------
    .. [1] University of Oregon Solar Radiation Measurement Laboratory
       http://solardata.uoregon.edu/
    .. [2] Station ID codes - Solar Radiation Measurement Laboratory
       http://solardata.uoregon.edu/StationIDCodes.html
    """
    # Use pd.to_datetime so that strings (e.g. '2021-01-01') are accepted
    start = pd.to_datetime(start)
    end = pd.to_datetime(end)

    # Generate list of months
    months = pd.date_range(start.date().replace(day=1), end, freq='1MS')
    months_str = months.strftime('%y%m')

    # Generate list of filenames
    filenames = [f"{station}{filetype}{m}.txt" for m in months_str]

    dfs = []  # Initialize list of monthly dataframes
    for f in filenames:
        try:
            dfi = read_srml(url + f, map_variables=map_variables)
            dfs.append(dfi)
        except urllib.error.HTTPError:
            warnings.warn(f"The following file was not found: {f}")

    data = pd.concat(dfs, axis='rows')

    meta = {'filetype': filetype,
            'station': station,
            'filenames': filenames}

    return data, meta