by Christoph Gohlke, Laboratory for Fluorescence Dynamics, University of California, Irvine
Updated on February, 2, 2022
This Python script uses the tifffile and imagecodecs packages to create a fsspec ReferenceFileSystem file in JSON format for the earthbigdata set, which consists of 1,033,422 GeoTIFF files stored on AWS. The ReferenceFileSystem is used to create a multi-dimensional xarray dataset.
See discussion at kerchunk/issues/78.
import os
import base64
import tifffile
import imagecodecs.numcodecs
import matplotlib.pyplot
import numcodecs
import fsspec
import xarray
import zarr
Call the aws command line app to recursively list all files in the earthbigdata set. Cache the output in a local file. Filter the list for TIFF files and remove the common path.
if not os.path.exists('earthbigdata.txt'):
os.system(
'aws s3 ls sentinel-1-global-coherence-earthbigdata/data/tiles'
' --recursive > earthbigdata.txt'
)
with open('earthbigdata.txt') as fh:
tiff_files = [
line.split()[-1][11:] for line in fh.readlines() if '.tif' in line
]
print('Number of TIFF files:', len(tiff_files))
Number of TIFF files: 1033422
Define labels, coordinate arrays, file name regex patterns, and categories for all dimensions in the earthbigdata set.
baseurl = (
'https://'
'sentinel-1-global-coherence-earthbigdata.s3.us-west-2.amazonaws.com'
'/data/tiles/'
)
chunkshape = (1200, 1200)
fillvalue = 0
latitude_label = 'latitude'
latitude_pattern = rf'(?P<{latitude_label}>[NS]\d+)'
latitude_coordinates = [
(j * -0.00083333333 - 0.000416666665 + i)
for i in range(82, -79, -1)
for j in range(1200)
]
latitude_category = {}
i = 0
for j in range(82, -1, -1):
latitude_category[f'N{j:-02}'] = i
i += 1
for j in range(1, 79):
latitude_category[f'S{j:-02}'] = i
i += 1
longitude_label = 'longitude'
longitude_pattern = rf'(?P<{longitude_label}>[EW]\d+)'
longitude_coordinates = [
(j * 0.00083333333 + 0.000416666665 + i)
for i in range(-180, 180)
for j in range(1200)
]
longitude_category = {}
i = 0
for j in range(180, 0, -1):
longitude_category[f'W{j:-03}'] = i
i += 1
for j in range(180):
longitude_category[f'E{j:-03}'] = i
i += 1
season_label = 'season'
season_category = {'winter': 0, 'spring': 1, 'summer': 2, 'fall': 3}
season_coordinates = list(season_category.keys())
season_pattern = rf'_(?P<{season_label}>{"|".join(season_category)})'
polarization_label = 'polarization'
polarization_category = {'vv': 0, 'vh': 1, 'hv': 2, 'hh': 3}
polarization_coordinates = list(polarization_category.keys())
polarization_pattern = (
rf'_(?P<{polarization_label}>{"|".join(polarization_category)})'
)
coherence_label = 'coherence'
coherence_category = {
'06': 0,
'12': 1,
'18': 2,
'24': 3,
'36': 4,
'48': 5,
}
coherence_coordinates = list(int(i) for i in coherence_category.keys())
coherence_pattern = (
rf'_COH(?P<{coherence_label}>{"|".join(coherence_category)})'
)
orbit_label = 'orbit'
orbit_coordinates = list(range(1, 176))
orbit_pattern = rf'_(?P<{orbit_label}>\d+)'
flightdirection_label = 'flightdirection'
flightdirection_category = {'A': 0, 'D': 1}
flightdirection_coordinates = list(flightdirection_category.keys())
flightdirection_pattern = (
rf'(?P<{flightdirection_label}>[{"|".join(flightdirection_category)}])_'
)
jsonfile = open('earthbigdata.json', 'w', newline='\n')
Add the coordinate arrays to a zarr group, convert it to a fsspec ReferenceFileSystem JSON string, and write it to the open file.
coordinates = {}
zarrgroup = zarr.open_group(coordinates)
zarrgroup.array(
longitude_label, data=longitude_coordinates, dtype='float32'
).attrs['_ARRAY_DIMENSIONS'] = [longitude_label]
zarrgroup.array(
latitude_label, data=latitude_coordinates, dtype='float32'
).attrs['_ARRAY_DIMENSIONS'] = [latitude_label]
zarrgroup.array(
season_label,
data=season_coordinates,
dtype=object,
object_codec=numcodecs.VLenUTF8(),
compression=None,
).attrs['_ARRAY_DIMENSIONS'] = [season_label]
zarrgroup.array(
polarization_label,
data=polarization_coordinates,
dtype=object,
object_codec=numcodecs.VLenUTF8(),
compression=None,
).attrs['_ARRAY_DIMENSIONS'] = [polarization_label]
zarrgroup.array(
coherence_label,
data=coherence_coordinates,
dtype='uint8',
compression=None,
).attrs['_ARRAY_DIMENSIONS'] = [coherence_label]
zarrgroup.array(orbit_label, data=orbit_coordinates, dtype='int32').attrs[
'_ARRAY_DIMENSIONS'
] = [orbit_label]
zarrgroup.array(
flightdirection_label,
data=flightdirection_coordinates,
dtype=object,
object_codec=numcodecs.VLenUTF8(),
compression=None,
).attrs['_ARRAY_DIMENSIONS'] = [flightdirection_label]
# base64 encode any values containing non-ascii characters
for k, v in coordinates.items():
try:
coordinates[k] = v.decode()
except UnicodeDecodeError:
coordinates[k] = 'base64:' + base64.b64encode(v).decode()
coordinates_json = tifffile.ZarrStore._json(coordinates).decode()
jsonfile.write(coordinates_json[:-2]) # skip the last newline and brace
69839
Filter the list of GeoTIFF files for files containing coherence 'COH' data. The regex pattern and categories are used to parse the file names for chunk indices.
Note: the created TiffSequence cannot be used to access any files. The file names do not refer to exising files. The baseurl is later used to get the real location of the files.
mode = 'COH'
fileseq = tifffile.TiffSequence(
[file for file in tiff_files if '_' + mode in file],
pattern=(
latitude_pattern
+ longitude_pattern
+ season_pattern
+ polarization_pattern
+ coherence_pattern
),
categories={
latitude_label: latitude_category,
longitude_label: longitude_category,
season_label: season_category,
polarization_label: polarization_category,
coherence_label: coherence_category,
},
)
assert len(fileseq.files) == 444821
assert fileseq.files_missing == 5119339
assert fileseq.shape == (161, 360, 4, 4, 6)
assert fileseq.labels == (
'latitude',
'longitude',
'season',
'polarization',
'coherence',
)
print(fileseq)
TiffSequence N00E005_fall_vv_COH12.tif files: 444821 (5119339 missing) shape: 161, 360, 4, 4, 6 labels: latitude, longitude, season, polarization, coherence
Define 'axestiled' to tile the latitude and longitude dimensions of the TiffSequence with the first and second image/chunk dimensions. Define extra 'zattrs' to create a xarray compatible store.
store = fileseq.aszarr(
dtype='uint8',
chunkshape=chunkshape,
fillvalue=fillvalue,
axestiled={0: 0, 1: 1},
zattrs={
'_ARRAY_DIMENSIONS': [
season_label,
polarization_label,
coherence_label,
latitude_label,
longitude_label,
]
},
)
print(store)
ZarrFileSequenceStore shape: 4, 4, 6, 193200, 432000 chunks: 1, 1, 1, 1200, 1200 dtype: uint8 fillvalue: 0
Use the mode name to create a zarr subgroup. Use the 'imagecodecs_tiff' numcodecs compatible codec for decoding TIFF files.
store.write_fsspec(
jsonfile,
baseurl,
groupname=mode,
codec_id='imagecodecs_tiff',
_append=True,
)
Repeat the TiffSequence->aszarr->write_fsspec workflow for the other modes.
for mode in (
'AMP',
'tau',
'rmse',
'rho',
):
fileseq = tifffile.TiffSequence(
[file for file in tiff_files if '_' + mode in file],
pattern=(
latitude_pattern
+ longitude_pattern
+ season_pattern
+ polarization_pattern
),
categories={
latitude_label: latitude_category,
longitude_label: longitude_category,
season_label: season_category,
polarization_label: polarization_category,
},
)
print(fileseq)
with fileseq.aszarr(
dtype='uint16',
chunkshape=chunkshape,
fillvalue=fillvalue,
axestiled={0: 0, 1: 1},
zattrs={
'_ARRAY_DIMENSIONS': [
season_label,
polarization_label,
latitude_label,
longitude_label,
]
},
) as store:
print(store)
store.write_fsspec(
jsonfile,
baseurl,
groupname=mode,
codec_id='imagecodecs_tiff',
_append=True,
)
for mode in ('inc', 'lsmap'):
fileseq = tifffile.TiffSequence(
[file for file in tiff_files if '_' + mode in file],
pattern=(
latitude_pattern
+ longitude_pattern
+ orbit_pattern
+ flightdirection_pattern
),
categories={
latitude_label: latitude_category,
longitude_label: longitude_category,
# orbit has no category
flightdirection_label: flightdirection_category,
},
)
print(fileseq)
with fileseq.aszarr(
dtype='uint8',
chunkshape=chunkshape,
fillvalue=fillvalue,
axestiled={0: 0, 1: 1},
zattrs={
'_ARRAY_DIMENSIONS': [
orbit_label,
flightdirection_label,
latitude_label,
longitude_label,
]
},
) as store:
print(store)
store.write_fsspec(
jsonfile,
baseurl,
groupname=mode,
codec_id='imagecodecs_tiff',
_append=True,
)
TiffSequence N00E005_fall_vh_AMP.tif files: 187693 (739667 missing) shape: 161, 360, 4, 4 labels: latitude, longitude, season, polarization ZarrFileSequenceStore shape: 4, 4, 193200, 432000 chunks: 1, 1, 1200, 1200 dtype: uint16 fillvalue: 0 TiffSequence N00E005_fall_vv_tau.tif files: 98331 (829029 missing) shape: 161, 360, 4, 4 labels: latitude, longitude, season, polarization ZarrFileSequenceStore shape: 4, 4, 193200, 432000 chunks: 1, 1, 1200, 1200 dtype: uint16 fillvalue: 0 TiffSequence N00E005_fall_vv_rmse.tif files: 98331 (829029 missing) shape: 161, 360, 4, 4 labels: latitude, longitude, season, polarization ZarrFileSequenceStore shape: 4, 4, 193200, 432000 chunks: 1, 1, 1200, 1200 dtype: uint16 fillvalue: 0 TiffSequence N00E005_fall_vv_rho.tif files: 98331 (829029 missing) shape: 161, 360, 4, 4 labels: latitude, longitude, season, polarization ZarrFileSequenceStore shape: 4, 4, 193200, 432000 chunks: 1, 1, 1200, 1200 dtype: uint16 fillvalue: 0 TiffSequence N00E005_124D_inc.tif files: 52956 (20233044 missing) shape: 161, 360, 175, 2 labels: latitude, longitude, orbit, flightdirection ZarrFileSequenceStore shape: 175, 2, 193200, 432000 chunks: 1, 1, 1200, 1200 dtype: uint8 fillvalue: 0 TiffSequence N00E005_124D_lsmap.tif files: 52959 (20233041 missing) shape: 161, 360, 175, 2 labels: latitude, longitude, orbit, flightdirection ZarrFileSequenceStore shape: 175, 2, 193200, 432000 chunks: 1, 1, 1200, 1200 dtype: uint8 fillvalue: 0
jsonfile.write('\n}')
jsonfile.close()
Register imagecodecs.numcodecs before using the ReferenceFileSystem.
imagecodecs.numcodecs.register_codecs()
Specify the 'target_protocol' to load a local file.
mapper = fsspec.get_mapper(
'reference://',
fo='earthbigdata.json',
target_protocol='file',
remote_protocol='http',
)
Use 'mask_and_scale' to disable conversion to floating point.
dataset = xarray.open_dataset(
mapper,
engine='zarr',
mask_and_scale=False,
backend_kwargs={'consolidated': False},
)
print(dataset)
<xarray.Dataset> Dimensions: (season: 4, polarization: 4, latitude: 193200, longitude: 432000, coherence: 6, flightdirection: 2, orbit: 175) Coordinates: * coherence (coherence) uint8 6 12 18 24 36 48 * flightdirection (flightdirection) object 'A' 'D' * latitude (latitude) float32 82.0 82.0 82.0 ... -79.0 -79.0 -79.0 * longitude (longitude) float32 -180.0 -180.0 -180.0 ... 180.0 180.0 * orbit (orbit) int32 1 2 3 4 5 6 7 ... 169 170 171 172 173 174 175 * polarization (polarization) object 'vv' 'vh' 'hv' 'hh' * season (season) object 'winter' 'spring' 'summer' 'fall' Data variables: AMP (season, polarization, latitude, longitude) uint16 ... COH (season, polarization, coherence, latitude, longitude) uint8 ... inc (orbit, flightdirection, latitude, longitude) uint8 ... lsmap (orbit, flightdirection, latitude, longitude) uint8 ... rho (season, polarization, latitude, longitude) uint16 ... rmse (season, polarization, latitude, longitude) uint16 ... tau (season, polarization, latitude, longitude) uint16 ...
socal = dataset.sel(latitude=slice(36, 32.5), longitude=slice(-121, -115))
print(socal)
<xarray.Dataset> Dimensions: (season: 4, polarization: 4, latitude: 4200, longitude: 7200, coherence: 6, flightdirection: 2, orbit: 175) Coordinates: * coherence (coherence) uint8 6 12 18 24 36 48 * flightdirection (flightdirection) object 'A' 'D' * latitude (latitude) float32 36.0 36.0 36.0 36.0 ... 32.5 32.5 32.5 * longitude (longitude) float32 -121.0 -121.0 -121.0 ... -115.0 -115.0 * orbit (orbit) int32 1 2 3 4 5 6 7 ... 169 170 171 172 173 174 175 * polarization (polarization) object 'vv' 'vh' 'hv' 'hh' * season (season) object 'winter' 'spring' 'summer' 'fall' Data variables: AMP (season, polarization, latitude, longitude) uint16 ... COH (season, polarization, coherence, latitude, longitude) uint8 ... inc (orbit, flightdirection, latitude, longitude) uint8 ... lsmap (orbit, flightdirection, latitude, longitude) uint8 ... rho (season, polarization, latitude, longitude) uint16 ... rmse (season, polarization, latitude, longitude) uint16 ... tau (season, polarization, latitude, longitude) uint16 ...
The few GeoTIFF files comprising the selection are transparently downloaded, decoded, and stitched to an in-memory numpy array and plotted using matplotlib.
image = socal['COH'].loc['winter', 'vv', 12]
assert image[100, 100] == 53
xarray.plot.imshow(image, size=6, aspect=1.8)
matplotlib.pyplot.show()