forked from pvlib/pvlib-python
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathbsrn.py
More file actions
459 lines (405 loc) · 21.2 KB
/
bsrn.py
File metadata and controls
459 lines (405 loc) · 21.2 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
"""Functions to read data from the Baseline Surface Radiation Network (BSRN).
.. codeauthor:: Adam R. Jensen <adam-r-j@hotmail.com>
"""
import pandas as pd
import gzip
import ftplib
import warnings
import io
import os
# Hostname of the BSRN FTP server hosting the station-to-archive files
BSRN_FTP_URL = "ftp.bsrn.awi.de"

# Fixed-width (start, stop) column specifications passed to pd.read_fwf for
# logical records (LR) 0100, 0300, and 0500 of a station-to-archive file.
BSRN_LR0100_COL_SPECS = [(0, 3), (4, 9), (10, 16), (16, 22), (22, 27),
                         (27, 32), (32, 39), (39, 45), (45, 50), (50, 55),
                         (55, 64), (64, 70), (70, 75)]
BSRN_LR0300_COL_SPECS = [(1, 3), (4, 9), (10, 16), (16, 22), (22, 27),
                         (27, 31), (31, 38), (38, 44), (44, 49), (49, 54),
                         (54, 61), (61, 67), (67, 72), (72, 78)]
BSRN_LR0500_COL_SPECS = [(0, 3), (3, 8), (8, 14), (14, 20), (20, 26), (26, 32),
                         (32, 38), (38, 44), (44, 50), (50, 56), (56, 62),
                         (62, 68), (68, 74), (74, 80)]

# Column names assigned after parsing each LR. 'empty' marks placeholder
# fields that are dropped before the data is returned. LR0100 and LR0500
# records span two physical lines each (the parser pairs consecutive rows and
# unstacks them), which is why these lists hold twice as many names as there
# are column specs; LR0300 records are a single line.
BSRN_LR0100_COLUMNS = ['day', 'minute',
                       'ghi', 'ghi_std', 'ghi_min', 'ghi_max',
                       'dni', 'dni_std', 'dni_min', 'dni_max',
                       'empty', 'empty', 'empty', 'empty', 'empty',
                       'dhi', 'dhi_std', 'dhi_min', 'dhi_max',
                       'lwd', 'lwd_std', 'lwd_min', 'lwd_max',
                       'temp_air', 'relative_humidity', 'pressure']
BSRN_LR0300_COLUMNS = ['day', 'minute', 'gri', 'gri_std', 'gri_min', 'gri_max',
                       'lwu', 'lwu_std', 'lwu_min', 'lwu_max', 'net_radiation',
                       'net_radiation_std', 'net_radiation_min',
                       'net_radiation_max']
# NOTE(review): 'uvb_diffuse' and 'uvb_diffuse_std' appear twice below, which
# yields duplicate column labels in the parsed DataFrame — verify against the
# BSRN LR0500 record layout (GCOS-174) whether that is intentional.
BSRN_LR0500_COLUMNS = ['day', 'minute', 'uva_global', 'uva_global_std',
                       'uva_global_min', 'uva_global_max', 'uvb_direct',
                       'uvb_direct_std', 'uvb_direct_min', 'uvb_direct_max',
                       'empty', 'empty', 'empty', 'empty',
                       'uvb_global', 'uvb_global_std', 'uvb_global_min',
                       'uvb_global_max', 'uvb_diffuse', 'uvb_diffuse_std',
                       'uvb_diffuse', 'uvb_diffuse_std',
                       'uvb_diffuse_min', 'uvb_diffuse_max',
                       'uvb_reflected', 'uvb_reflected_std',
                       'uvb_reflected_min', 'uvb_reflected_max']

# Mapping from 4-digit LR identifier to its list of column names
BSRN_COLUMNS = {'0100': BSRN_LR0100_COLUMNS, '0300': BSRN_LR0300_COLUMNS,
                '0500': BSRN_LR0500_COLUMNS}
def _empty_dataframe_from_logical_records(logical_records):
    """Return an empty DataFrame whose columns correspond to the requested
    logical records, omitting the 'day'/'minute' fields (the first two names
    of each record) and the 'empty' placeholder columns."""
    names = [name
             for record in logical_records
             for name in BSRN_COLUMNS[record][2:]
             if name != 'empty']
    return pd.DataFrame(columns=names)
def get_bsrn(station, start, end, username, password,
             logical_records=('0100',), save_path=None):
    """
    Retrieve ground measured irradiance data from the BSRN FTP server.

    The BSRN (Baseline Surface Radiation Network) is a world wide network
    of high-quality solar radiation monitoring stations as described in [1]_.
    Data is retrieved from the BSRN FTP server [2]_.

    Data is returned for the entire months between and including start and end.

    Parameters
    ----------
    station: str
        3-letter BSRN station abbreviation
    start: datetime-like
        First day of the requested period
    end: datetime-like
        Last day of the requested period
    username: str
        username for accessing the BSRN FTP server
    password: str
        password for accessing the BSRN FTP server
    logical_records: list or tuple, default: ('0100',)
        List of the logical records (LR) to parse. Options include: '0100',
        '0300', and '0500'.
    save_path: str or path-like, optional
        If specified, a directory path of where to save files.

    Returns
    -------
    data: DataFrame
        timeseries data from the BSRN archive, see
        :func:`pvlib.iotools.read_bsrn` for fields. An empty DataFrame is
        returned if no data was found for the time period.
    metadata: dict
        metadata for the last available monthly file.

    Raises
    ------
    KeyError
        If the specified station does not exist on the FTP server.

    Warns
    -----
    UserWarning
        If one or more requested files are missing a UserWarning is returned
        with a list of the filenames missing. If no files match the specified
        station and timeframe a separate UserWarning is given.

    Notes
    -----
    The username and password for the BSRN FTP server can be obtained for free
    as described in the BSRN's Data Release Guidelines [3]_.

    Currently only parsing of logical records 0100, 0300 and 0500 is supported.
    Note not all stations measure LR0300 and LR0500. However, LR0100 is
    mandatory as it contains the basic irradiance and auxiliary measurements.
    See [4]_ for a description of the different logical records. Future updates
    may include parsing of additional data and metadata.

    Important
    ---------
    While data from the BSRN is generally of high-quality, measurement data
    should always be quality controlled before usage!

    Examples
    --------
    >>> # Retrieve two months irradiance data from the Cabauw BSRN station
    >>> data, metadata = pvlib.iotools.get_bsrn(  # doctest: +SKIP
    >>>     start=pd.Timestamp(2020,1,1), end=pd.Timestamp(2020,12,1),  # doctest: +SKIP
    >>>     station='cab', username='yourusername', password='yourpassword')  # doctest: +SKIP

    See Also
    --------
    pvlib.iotools.read_bsrn, pvlib.iotools.parse_bsrn

    References
    ----------
    .. [1] `World Radiation Monitoring Center - Baseline Surface Radiation
       Network (BSRN)
       <https://bsrn.awi.de/>`_
    .. [2] `BSRN Data Retrieval via FTP
       <https://bsrn.awi.de/data/data-retrieval-via-ftp/>`_
    .. [3] `BSRN Data Release Guidelines
       <https://bsrn.awi.de/data/conditions-of-data-release/>`_
    .. [4] `Update of the Technical Plan for BSRN Data Management, 2013,
       Global Climate Observing System (GCOS) GCOS-174.
       <https://bsrn.awi.de/fileadmin/user_upload/bsrn.awi.de/Publications/gcos-174.pdf>`_
    """  # noqa: E501
    # The FTP server uses lowercase station abbreviations
    station = station.lower()

    # Generate list of files to download based on start/end (SSSMMYY.dat.gz)
    filenames = pd.date_range(
        start, end.replace(day=1) + pd.DateOffset(months=1), freq='1M')\
        .strftime(f"{station}%m%y.dat.gz").tolist()

    # Create FTP connection
    with ftplib.FTP(BSRN_FTP_URL, username, password) as ftp:
        # Change to station sub-directory (checks that the station exists)
        try:
            ftp.cwd(f'/{station}')
        except ftplib.error_perm as e:
            raise KeyError('Station sub-directory does not exist. Specified '
                           'station is probably not a proper three letter '
                           'station abbreviation.') from e
        dfs = []  # Initialize list for monthly dataframes
        non_existing_files = []  # Initialize list of files that were not found
        for filename in filenames:
            try:
                bio = io.BytesIO()  # Initialize BytesIO object
                # Retrieve binary file from server and write to BytesIO object
                # (the RETR command must name the remote file being fetched)
                response = ftp.retrbinary(f'RETR {filename}', bio.write)
                # Check that transfer was successful
                if not response.startswith('226 Transfer complete'):
                    raise ftplib.Error(response)
                # Save file locally if save_path is specified
                if save_path is not None:
                    # Create local file
                    with open(os.path.join(save_path, filename), 'wb') as f:
                        f.write(bio.getbuffer())  # Write local file
                # Open gzip file and convert to StringIO
                bio.seek(0)  # reset buffer to start of file
                gzip_file = io.TextIOWrapper(gzip.GzipFile(fileobj=bio),
                                             encoding='latin1')
                dfi, metadata = parse_bsrn(gzip_file, logical_records)
                dfs.append(dfi)
            # FTP client raises an error if the file does not exist on server
            except ftplib.error_perm as e:
                if str(e) == '550 Failed to open file.':
                    non_existing_files.append(filename)
                else:
                    raise ftplib.error_perm(e)
        ftp.quit()  # Close and exit FTP connection
    # Raise user warnings
    if not dfs:  # If no files were found
        warnings.warn('No files were available for the specified timeframe.')
    elif non_existing_files:  # If only some files were missing
        warnings.warn(f'The following files were not found: {non_existing_files}')  # noqa: E501
    # Concatenate monthly dataframes to one dataframe
    if len(dfs):
        data = pd.concat(dfs, axis='rows')
    else:  # Return empty dataframe
        data = _empty_dataframe_from_logical_records(logical_records)
        metadata = {}
    # Return dataframe and metadata (metadata belongs to last available file)
    return data, metadata
def parse_bsrn(fbuf, logical_records=('0100',)):
    """
    Parse a file-like buffer of a BSRN station-to-archive file.

    Parameters
    ----------
    fbuf: file-like buffer
        Buffer of a BSRN station-to-archive data file. Must be seekable, as
        the buffer is rewound once per parsed logical record.
    logical_records: list or tuple, default: ('0100',)
        List of the logical records (LR) to parse. Options include: '0100',
        '0300', and '0500'.

    Returns
    -------
    data: DataFrame
        timeseries data from the BSRN archive, see
        :func:`pvlib.iotools.read_bsrn` for fields. An empty DataFrame is
        returned if the specified logical records were not found.
    metadata: dict
        Dictionary containing metadata (primarily from LR0004).

    See Also
    --------
    pvlib.iotools.read_bsrn, pvlib.iotools.get_bsrn
    """
    # Parse metadata
    fbuf.readline()  # first line should be *U0001, so read it and discard
    date_line = fbuf.readline()  # second line contains important metadata
    # The date line encodes station number (chars 0-2), month (3-5),
    # year (7-10), and the data version as its last whitespace-separated field
    start_date = pd.Timestamp(year=int(date_line[7:11]),
                              month=int(date_line[3:6]), day=1,
                              tz='UTC')  # BSRN timestamps are UTC
    metadata = {}  # Initialize dictionary containing metadata
    metadata['start date'] = start_date
    metadata['station identification number'] = int(date_line[:3])
    metadata['version of data'] = int(date_line.split()[-1])
    # Advance the buffer until the LR0004 header line is reached
    for line in fbuf:
        if line[2:6] == '0004':  # stop once LR0004 has been reached
            break
        elif line == '':
            raise ValueError('Mandatory record LR0004 not found.')
    # LR0004 fields are consumed strictly in order. readline(n) reads at most
    # n characters, so several fixed-width fields sharing one physical line
    # (e.g. surface/topography type) are read piecewise.
    metadata['date when station description changed'] = fbuf.readline().strip()
    metadata['surface type'] = int(fbuf.readline(3))
    metadata['topography type'] = int(fbuf.readline())
    metadata['address'] = fbuf.readline().strip()
    metadata['telephone no. of station'] = fbuf.readline(20).strip()
    metadata['FAX no. of station'] = fbuf.readline().strip()
    metadata['TCP/IP no. of station'] = fbuf.readline(15).strip()
    metadata['e-mail address of station'] = fbuf.readline().strip()
    metadata['latitude_bsrn'] = float(fbuf.readline(8))  # BSRN convention
    metadata['latitude'] = metadata['latitude_bsrn'] - 90  # ISO 19115
    metadata['longitude_bsrn'] = float(fbuf.readline(8))  # BSRN convention
    metadata['longitude'] = metadata['longitude_bsrn'] - 180  # ISO 19115
    metadata['altitude'] = int(fbuf.readline(5))
    metadata['identification of "SYNOP" station'] = fbuf.readline().strip()
    metadata['date when horizon changed'] = fbuf.readline().strip()
    # Parse last section of LR0004 containing the horizon elevation data,
    # which continues until the next record marker ('*') or end of file
    horizon = []  # list for raw horizon elevation data
    while True:
        line = fbuf.readline()
        if ('*' in line) | (line == ''):
            break
        else:
            horizon += [int(i) for i in line.split()]
    # Values alternate azimuth, elevation; -1 entries are dropped as missing
    horizon = pd.Series(horizon[1::2], horizon[::2], name='horizon_elevation',
                        dtype=int).drop(-1, errors='ignore').sort_index()
    horizon.index.name = 'azimuth'
    metadata['horizon'] = horizon
    # Read file and store the starting line number and number of lines for
    # each logical record (LR)
    fbuf.seek(0)  # reset buffer to start of file
    lr_startrow = {}  # Dictionary of starting line number for each LR
    lr_nrows = {}  # Dictionary of number of lines for each LR
    for num, line in enumerate(fbuf):
        if line.startswith('*'):  # Find start of all logical records
            # Each new LR header closes the previous LR; 'lr' deliberately
            # refers to the LR found in the previous iteration (hence the
            # noqa for the first-iteration "undefined name" warning)
            if len(lr_startrow) >= 1:
                lr_nrows[lr] = num - lr_startrow[lr] - 1  # noqa: F821
            lr = line[2:6]  # string of 4 digit LR number
            lr_startrow[lr] = num
    # The final LR extends to the end of the file ('num' is the last index)
    lr_nrows[lr] = num - lr_startrow[lr]
    # Reject unsupported logical records before any parsing work
    for lr in logical_records:
        if lr not in ['0100', '0300', '0500']:
            raise ValueError(f"Logical record {lr} not in "
                             "['0100', '0300','0500'].")
    dfs = []  # Initialize empty list for dataframes
    # Parse LR0100 - basic measurements including GHI, DNI, DHI and temperature
    if ('0100' in lr_startrow.keys()) & ('0100' in logical_records):
        fbuf.seek(0)  # reset buffer to start of file
        LR_0100 = pd.read_fwf(fbuf, skiprows=lr_startrow['0100'] + 1,
                              nrows=lr_nrows['0100'], header=None,
                              colspecs=BSRN_LR0100_COL_SPECS,
                              na_values=[-999.0, -99.9])
        # Create multi-index and unstack, resulting in 1 col for each variable
        # (each 1-minute record spans two physical lines, paired by index//2)
        LR_0100 = LR_0100.set_index([LR_0100.index // 2, LR_0100.index % 2])
        LR_0100 = LR_0100.unstack(level=1).swaplevel(i=0, j=1, axis='columns')
        # Sort columns to match original order and assign column names
        LR_0100 = LR_0100.reindex(sorted(LR_0100.columns), axis='columns')
        LR_0100.columns = BSRN_LR0100_COLUMNS
        # Set datetime index from day-of-month and minute-of-day columns
        LR_0100.index = (start_date+pd.to_timedelta(LR_0100['day']-1, unit='d')
                         + pd.to_timedelta(LR_0100['minute'], unit='T'))
        # Drop empty, minute, and day columns
        LR_0100 = LR_0100.drop(columns=['empty', 'day', 'minute'])
        dfs.append(LR_0100)
    # Parse LR0300 - other time series data, including upward and net radiation
    if ('0300' in lr_startrow.keys()) & ('0300' in logical_records):
        fbuf.seek(0)  # reset buffer to start of file
        # LR0300 records are a single physical line, so no unstacking needed
        LR_0300 = pd.read_fwf(fbuf, skiprows=lr_startrow['0300']+1,
                              nrows=lr_nrows['0300'], header=None,
                              na_values=[-999.0, -99.9],
                              colspecs=BSRN_LR0300_COL_SPECS,
                              names=BSRN_LR0300_COLUMNS)
        LR_0300.index = (start_date+pd.to_timedelta(LR_0300['day']-1, unit='d')
                         + pd.to_timedelta(LR_0300['minute'], unit='T'))
        LR_0300 = LR_0300.drop(columns=['day', 'minute']).astype(float)
        dfs.append(LR_0300)
    # Parse LR0500 - UV measurements
    if ('0500' in lr_startrow.keys()) & ('0500' in logical_records):
        fbuf.seek(0)  # reset buffer to start of file
        LR_0500 = pd.read_fwf(fbuf, skiprows=lr_startrow['0500']+1,
                              nrows=lr_nrows['0500'], na_values=[-99.9],
                              header=None, colspecs=BSRN_LR0500_COL_SPECS)
        # Create multi-index and unstack, resulting in 1 col for each variable
        # (same two-lines-per-record layout as LR0100)
        LR_0500 = LR_0500.set_index([LR_0500.index // 2, LR_0500.index % 2])
        LR_0500 = LR_0500.unstack(level=1).swaplevel(i=0, j=1, axis='columns')
        # Sort columns to match original order and assign column names
        LR_0500 = LR_0500.reindex(sorted(LR_0500.columns), axis='columns')
        LR_0500.columns = BSRN_LR0500_COLUMNS
        LR_0500.index = (start_date+pd.to_timedelta(LR_0500['day']-1, unit='d')
                         + pd.to_timedelta(LR_0500['minute'], unit='T'))
        LR_0500 = LR_0500.drop(columns=['empty', 'day', 'minute'])
        dfs.append(LR_0500)
    if len(dfs):
        data = pd.concat(dfs, axis='columns')
    else:
        # None of the requested LRs were present in the file
        data = _empty_dataframe_from_logical_records(logical_records)
        metadata = {}
    return data, metadata
def read_bsrn(filename, logical_records=('0100',)):
    """
    Read a BSRN station-to-archive file into a DataFrame.

    The BSRN (Baseline Surface Radiation Network) is a world wide network
    of high-quality solar radiation monitoring stations as described in [1]_.
    The function is able to parse logical records (LR) 0100, 0300, and 0500.
    LR0100 contains the basic measurements, which include global, diffuse, and
    direct irradiance, as well as downwelling long-wave radiation [2]_. Future
    updates may include parsing of additional data and metadata.

    BSRN files are freely available and can be accessed via FTP [3]_. The
    username and password for the BSRN FTP server can be obtained for free as
    described in the BSRN's Data Release Guidelines [3]_.

    Parameters
    ----------
    filename: str or path-like
        Name or path of a BSRN station-to-archive data file
    logical_records: list or tuple, default: ('0100',)
        List of the logical records (LR) to parse. Options include: '0100',
        '0300', and '0500'.

    Returns
    -------
    data: DataFrame
        A DataFrame with the columns as described below. For a more extensive
        description of the variables, consult [2]_. An empty DataFrame is
        returned if the specified logical records were not found.
    metadata: dict
        Dictionary containing metadata (primarily from LR0004).

    Notes
    -----
    The data DataFrame for LR0100 includes the following fields:

    ======================= ====== ==========================================
    Key                     Format Description
    ======================= ====== ==========================================
    **Logical record 0100**
    ---------------------------------------------------------------------------
    ghi†                    float  Mean global horizontal irradiance [W/m^2]
    dni†                    float  Mean direct normal irradiance [W/m^2]
    dhi†                    float  Mean diffuse horizontal irradiance [W/m^2]
    lwd†                    float  Mean. downward long-wave radiation [W/m^2]
    temp_air                float  Air temperature [°C]
    relative_humidity       float  Relative humidity [%]
    pressure                float  Atmospheric pressure [hPa]
    ----------------------- ------ ------------------------------------------
    **Logical record 0300**
    ---------------------------------------------------------------------------
    gri†                    float  Mean ground-reflected irradiance [W/m^2]
    lwu†                    float  Mean long-wave upwelling irradiance [W/m^2]
    net_radiation†          float  Mean net radiation (net radiometer) [W/m^2]
    ----------------------- ------ ------------------------------------------
    **Logical record 0500**
    ---------------------------------------------------------------------------
    uva_global†             float  Mean UV-A global irradiance [W/m^2]
    uvb_direct†             float  Mean UV-B direct irradiance [W/m^2]
    uvb_global†             float  Mean UV-B global irradiance [W/m^2]
    uvb_diffuse†            float  Mean UV-B diffuse irradiance [W/m^2]
    uvb_reflected†          float  Mean UV-B reflected irradiance [W/m^2]
    ======================= ====== ==========================================

    † Marked variables have corresponding columns for the standard deviation
    (_std), minimum (_min), and maximum (_max) calculated from the 60 samples
    that are averaged into each 1-minute measurement.

    Hint
    ----
    According to [2]_ "All time labels in the station-to-archive files denote
    the start of a time interval." This corresponds to left bin edge labeling.

    See Also
    --------
    pvlib.iotools.parse_bsrn, pvlib.iotools.get_bsrn

    References
    ----------
    .. [1] `World Radiation Monitoring Center - Baseline Surface Radiation
       Network (BSRN)
       <https://bsrn.awi.de/>`_
    .. [2] `Update of the Technical Plan for BSRN Data Management, 2013,
       Global Climate Observing System (GCOS) GCOS-174.
       <https://bsrn.awi.de/fileadmin/user_upload/bsrn.awi.de/Publications/gcos-174.pdf>`_
    .. [3] `BSRN Data Retrieval via FTP
       <https://bsrn.awi.de/data/data-retrieval-via-ftp/>`_
    .. [4] `BSRN Data Release Guidelines
       <https://bsrn.awi.de/data/conditions-of-data-release/>`_
    """  # noqa: E501
    if str(filename).endswith('.gz'):  # check if file is a gzipped (.gz) file
        # gzip must be opened in text mode so parse_bsrn receives str lines
        open_func, mode = gzip.open, 'rt'
    else:
        open_func, mode = open, 'r'
    with open_func(filename, mode) as f:
        content = parse_bsrn(f, logical_records)
    return content