From 883f58544012de61185e4cc5e1ce392f9f14c7ec Mon Sep 17 00:00:00 2001 From: Chris Holden Date: Mon, 27 Jan 2025 16:11:09 -0500 Subject: [PATCH] feat: add script to apply ESA quality mask (#15) * Add CLI to apply per-band QA mask for lost or degraded pixel values * Update list of spectral bands to subset used by HLS * Add unit tests and fix JPEG2000 compression on update * Update README with apply_s2_quality_mask usage * Check the GENERAL_QUALITY metadata to see if we can exit early * granule_dir is actually a str in click==7.x --- README.md | 3 + apply_s2_quality_mask/__init__.py | 0 .../apply_s2_quality_mask.py | 188 ++++++++++++++++++ setup.py | 5 +- .../QI_DATA/GENERAL_QUALITY.xml | 49 +++++ .../QI_DATA/GENERAL_QUALITY.xml | 52 +++++ .../QI_DATA/GENERAL_QUALITY.xml | 52 +++++ tests/test_apply_s2_quality_mask.py | 140 +++++++++++++ 8 files changed, 488 insertions(+), 1 deletion(-) create mode 100644 apply_s2_quality_mask/__init__.py create mode 100644 apply_s2_quality_mask/apply_s2_quality_mask.py create mode 100644 tests/data/quality-mask/L1C_T12QXM_A048143_20240909T175112/QI_DATA/GENERAL_QUALITY.xml create mode 100644 tests/data/quality-mask/L1C_T43QDG_A031305_20230305T053523/QI_DATA/GENERAL_QUALITY.xml create mode 100644 tests/data/quality-mask/L1C_T45TXF_A038726_20221121T050115/QI_DATA/GENERAL_QUALITY.xml create mode 100644 tests/test_apply_s2_quality_mask.py diff --git a/README.md b/README.md index 2abdc1f..8c4af4a 100644 --- a/README.md +++ b/README.md @@ -3,6 +3,9 @@ ## Usage ```bash +$ apply_s2_quality_mask INPUTS2DIR +``` +```bash $ check_solar_zenith_sentinel INPUTXML ``` ```bash diff --git a/apply_s2_quality_mask/__init__.py b/apply_s2_quality_mask/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/apply_s2_quality_mask/apply_s2_quality_mask.py b/apply_s2_quality_mask/apply_s2_quality_mask.py new file mode 100644 index 0000000..e3a7cb8 --- /dev/null +++ b/apply_s2_quality_mask/apply_s2_quality_mask.py @@ -0,0 +1,188 @@ +from typing import List, Optional, Tuple, TypeVar +from pathlib import Path + +import click +import rasterio +from lxml import etree + + +# MSI band number definitions for bands used by HLS v2 product +# (coastal, blue/green/red, NIR, SWIR1, SWIR2) +SPECTRAL_BANDS = frozenset( + { + "B01", + "B02", + "B03", + "B04", + "B8A", + "B11", + "B12", + } +) + +# ensure compression is lossless to avoid changing values, +# https://gdal.org/en/stable/drivers/raster/jp2openjpeg.html#lossless-compression +JPEG2000_NO_COMPRESSION_OPTIONS = { + "QUALITY": "100", + "REVERSIBLE": "YES", + "YCBCR420": "NO", +} + + +T = TypeVar("T") + + +def _one_or_none(seq: List[T]) -> Optional[T]: + if len(seq) == 1: + return seq[0] + + +def find_affected_bands(granule_dir: Path) -> List[str]: + """Check the granule GENERAL_QUALITY.xml for affected bands, if any""" + report_path = _one_or_none(list(granule_dir.glob("**/QI_DATA/GENERAL_QUALITY.xml"))) + + # We couldn't find the report, so check all quality masks + if report_path is None: + return list(SPECTRAL_BANDS) + + quality_report = etree.parse(str(report_path)) + root = quality_report.getroot() + # give default namespace a name for use in xpath + nsmap = {"qa": root.nsmap[None]} + + # We have a XML structure like, + # + # + # There is data loss in this tile + # + # B01 B02 + # + # + # + # We want to grab the text from the "Affected_Bands" when the message + # indicates there is data loss in the tile. + data_loss_bands_txt = _one_or_none( + root.xpath( + ( + ".//qa:check[qa:message/text() = 'There is data loss in this tile']/" + "qa:extraValues/qa:value[@name = 'Affected_Bands']/text()" + ), + namespaces=nsmap, + ) + ) + if data_loss_bands_txt is not None: + return data_loss_bands_txt.split(" ") + return [] + + +def find_image_mask_pairs( + granule_dir: Path, bands: List[str] +) -> List[Tuple[Path, Path]]: + """Search granule directory for image + mask pairs + + The quality masks were produced in an imagery format since baseline 04.00 + (~Jan 25, 2022 and onward), so this function might return nothing + if run on granules created with older baselines. These older baselines + encoded the mask quality information in the GML (vector) format. + + The relevant parts of an unzipped granule looks like, + ``` + ${GRANULE_ID}.SAFE + ${GRANULE_ID}.SAFE/GRANULE + ${GRANULE_ID}.SAFE/GRANULE/L1C_T45TXF_A038726_20221121T050115 + ${GRANULE_ID}.SAFE/GRANULE/L1C_T45TXF_A038726_20221121T050115/IMG_DATA + ${GRANULE_ID}.SAFE/GRANULE/L1C_T45TXF_A038726_20221121T050115/IMG_DATA/T45TXF_20221121T050121_B01.jp2 + ... (*B*.jp2) + ${GRANULE_ID}.SAFE/GRANULE/L1C_T45TXF_A038726_20221121T050115/QI_DATA + ${GRANULE_ID}.SAFE/GRANULE/L1C_T45TXF_A038726_20221121T050115/QI_DATA/MSK_QUALIT_B01.jp2 + ... (MSK_QUALIT_B*.jp2) + ``` + + References + ---------- + https://sentinels.copernicus.eu/web/sentinel/-/copernicus-sentinel-2-major-products-upgrade-upcoming + """ + pairs = [] + for band in bands: + image = _one_or_none(list(granule_dir.glob(f"**/IMG_DATA/*{band}.jp2"))) + mask = _one_or_none(list(granule_dir.glob(f"**/QI_DATA/MSK_QUALIT_{band}.jp2"))) + if image and mask: + pairs.append((image, mask)) + + # Find all bands, or none + if len(pairs) == len(SPECTRAL_BANDS): + return pairs + return [] + + +def apply_quality_mask(image: Path, mask: Path): + """Apply Sentinel-2 image quality mask + + Each spectral band (`IMG_DATA/B*.jp2`) has a corresponding quality mask + image (`QI_DATA/MSK_QUALIT_B*.jp2`) of the same spatial resolution. The mask + image has 8 bands, with each band encoding a boolean for the presence/absence + of a quality issue. The bands indicate, + + 1: lost ancillary packets + 2: degraded ancillary packets + 3: lost MSI packets + 4: degraded MSI packets + 5: defective pixels + 6: no data + 7: partially corrected cross talk + 8: saturated pixels + + We have decided to mask 3 (lost MSI packets) and 4 (degraded MSI packets) only. + So far, 5 and 7 are present in B10-B11, because interpolation is used to fill 5 + and the cross talk is partially corrected. + Saturated pixels are still useful for the magnitude of the reflectance; keep it. + + We apply the mask by updating the spectral data images in-place rather than + creating another file. + + References + ---------- + https://sentinels.copernicus.eu/web/sentinel/technical-guides/sentinel-2-msi/data-quality-reports + """ + with rasterio.open(mask) as mask_src: + qa_bands = mask_src.read(indexes=[3, 4]) + lost_degraded_mask = (qa_bands == 1).any(axis=0) + + # only update imagery if mask shows it has any bad pixels + if lost_degraded_mask.any(): + click.echo(f"Masking lost or degraded pixel values in {image}") + with rasterio.open(image, "r+", **JPEG2000_NO_COMPRESSION_OPTIONS) as img_src: + img = img_src.read(1) + # L1C images don't define the nodata value on file so we can't update the + # mask (e.g., via `write_mask`) but 0 is used as the nodata value + # (see SPECIAL_VALUE_INDEX for NODATA in metadata) + img[lost_degraded_mask] = 0 + img_src.write(img, 1) + + +@click.command() +@click.argument( + "granule_dir", + type=click.Path(file_okay=False, dir_okay=True, exists=True), +) +@click.pass_context +def main(ctx, granule_dir: str): + """Update Sentinel-2 imagery by masking lost or degraded pixels""" + granule_dir = Path(granule_dir) + + affected_bands = find_affected_bands(granule_dir) + affected_hls_bands = SPECTRAL_BANDS.intersection(affected_bands) + if not affected_hls_bands: + click.echo(f"No bands are affected by data loss in {granule_dir}") + ctx.exit() + + click.echo(f"Applying Sentinel-2 QAQC mask to granule_dir={granule_dir}") + image_mask_pairs = find_image_mask_pairs(granule_dir, affected_hls_bands) + for (image, mask) in image_mask_pairs: + apply_quality_mask(image, mask) + + click.echo("Complete") + + +if __name__ == "__main__": + main() diff --git a/setup.py b/setup.py index 030c132..4263e3b 100644 --- a/setup.py +++ b/setup.py @@ -8,7 +8,9 @@ "click~=7.1.0", "lxml", "boto3~=1.17.91", - "espa-python-library @ git+https://github.com/USGS-EROS/espa-python-library.git@v2.0.0#egg=espa-python-library" + "espa-python-library @ git+https://github.com/USGS-EROS/espa-python-library.git@v2.0.0#egg=espa-python-library", + "rasterio~=1.2", + "numpy~=1.16", ], include_package_data=True, extras_require={ @@ -16,6 +18,7 @@ "test": ["flake8", "pytest", "Jinja2==2.10.1", "moto[s3]~=2.0.8"] }, entry_points={"console_scripts": [ + "apply_s2_quality_mask=apply_s2_quality_mask.apply_s2_quality_mask:main", "parse_fmask=parse_fmask.parse_fmask:main", "check_solar_zenith_sentinel=check_solar_zenith_sentinel.check_solar_zenith_sentinel:main", "check_solar_zenith_landsat=check_solar_zenith_landsat.check_solar_zenith_landsat:main", diff --git a/tests/data/quality-mask/L1C_T12QXM_A048143_20240909T175112/QI_DATA/GENERAL_QUALITY.xml b/tests/data/quality-mask/L1C_T12QXM_A048143_20240909T175112/QI_DATA/GENERAL_QUALITY.xml new file mode 100644 index 0000000..e63f69e --- /dev/null +++ b/tests/data/quality-mask/L1C_T12QXM_A048143_20240909T175112/QI_DATA/GENERAL_QUALITY.xml @@ -0,0 +1,49 @@ + + + + GRANULE/L1C_T12QXM_A048143_20240909T175112/QI_DATA/GENERAL_QUALITY.xml + Report produced by OLQC-SC + + S2A + OPER + REP_OLQCPA + + UTC=2024-07-23T03:00:00 + UTC=2100-01-01T00:00:00 + + 2 + + OLQC-SC + OLQC-SC + 06.03.00 + UTC=2024-09-10T00:44:41 + + + + + + + S2A_OPER_GIP_OLQCPA_MPC__20240717T000043_V20240723T030000 + GENERAL_QUALITY + 1.0 + + + + Zenith solar angle (26.6116855171579) is within the threshold limit (82.0) + + 82.0 + 26.6116855171579 + + + + + Metadata file is present. + + + + No data loss in this tile. + + + + + diff --git a/tests/data/quality-mask/L1C_T43QDG_A031305_20230305T053523/QI_DATA/GENERAL_QUALITY.xml b/tests/data/quality-mask/L1C_T43QDG_A031305_20230305T053523/QI_DATA/GENERAL_QUALITY.xml new file mode 100644 index 0000000..3638e89 --- /dev/null +++ b/tests/data/quality-mask/L1C_T43QDG_A031305_20230305T053523/QI_DATA/GENERAL_QUALITY.xml @@ -0,0 +1,52 @@ + + + + GRANULE/L1C_T43QDG_A031305_20230305T053523/QI_DATA/GENERAL_QUALITY.xml + Report produced by OLQC-SC + + S2B + OPER + REP_OLQCPA + + UTC=2022-08-30T00:25:00 + UTC=2100-01-01T00:00:00 + + 2 + + OLQC-SC + OLQC-SC + 06.01.00 + UTC=2023-03-05T08:36:44 + + + + + + + S2B_OPER_GIP_OLQCPA_MPC__20220715T000042_V20220830T002500 + GENERAL_QUALITY + 1.0 + + + + Metadata file is present. + + + + Zenith solar angle (37.349671777297) is within the threshold limit (82.0) + + 82.0 + 37.349671777297 + + + + + There is data loss in this tile + + B01 B02 B03 B09 B10 B11 B12 B8A + + + + + + diff --git a/tests/data/quality-mask/L1C_T45TXF_A038726_20221121T050115/QI_DATA/GENERAL_QUALITY.xml b/tests/data/quality-mask/L1C_T45TXF_A038726_20221121T050115/QI_DATA/GENERAL_QUALITY.xml new file mode 100644 index 0000000..8a989dc --- /dev/null +++ b/tests/data/quality-mask/L1C_T45TXF_A038726_20221121T050115/QI_DATA/GENERAL_QUALITY.xml @@ -0,0 +1,52 @@ + + + + GRANULE/L1C_T45TXF_A038726_20221121T050115/QI_DATA/GENERAL_QUALITY.xml + Report produced by OLQC-SC + + S2A + OPER + REP_OLQCPA + + UTC=2022-08-30T00:25:00 + UTC=2100-01-01T00:00:00 + + 2 + + OLQC-SC + OLQC-SC + 05.01.00 + UTC=2022-11-21T07:37:39 + + + + + + + S2A_OPER_GIP_OLQCPA_MPC__20220715T000042_V20220830T002500 + GENERAL_QUALITY + 1.0 + + + + Metadata file is present. + + + + Zenith solar angle (61.7640691258715) is within the threshold limit (82.0) + + 82.0 + 61.7640691258715 + + + + + There is data loss in this tile + + B01 B02 B03 B04 B05 B06 B07 B08 B09 B10 B11 B12 B8A + + + + + + diff --git a/tests/test_apply_s2_quality_mask.py b/tests/test_apply_s2_quality_mask.py new file mode 100644 index 0000000..ecc690b --- /dev/null +++ b/tests/test_apply_s2_quality_mask.py @@ -0,0 +1,140 @@ +from pathlib import Path +from typing import List, Tuple + +import numpy as np +import pytest +import rasterio +from affine import Affine +from rasterio.crs import CRS + +from apply_s2_quality_mask.apply_s2_quality_mask import ( + JPEG2000_NO_COMPRESSION_OPTIONS, + SPECTRAL_BANDS, + apply_quality_mask, + find_affected_bands, + find_image_mask_pairs, +) + +TEST_DATA = Path(__file__).parents[0].joinpath("data", "quality-mask") + + +@pytest.mark.parametrize(["granule_dir", "affected_bands"], [ + pytest.param( + TEST_DATA / "L1C_T45TXF_A038726_20221121T050115", + ["B01", "B02", "B03", "B04", "B05", "B06", "B07", "B08", "B09", "B10", "B11", "B12", "B8A"], + id="All bands affected", + ), + pytest.param( + TEST_DATA / "L1C_T43QDG_A031305_20230305T053523", + ["B01", "B02", "B03", "B09", "B10", "B11", "B12", "B8A"], + id="Subset of bands affected", + ), + pytest.param( + TEST_DATA / "L1C_T12QXM_A048143_20240909T175112", + [], + id="No bands affected", + ), + pytest.param( + TEST_DATA, + SPECTRAL_BANDS, + id="No metadata file present", + ) +]) +def test_find_affected_bands(granule_dir: Path, affected_bands: List[str]): + """Test we find the bands affected by a quality issue correctly from metadata""" + test = find_affected_bands(granule_dir) + assert set(test) == set(affected_bands) + + +def make_fake_s2_granule( + prefix: Path, + band: str, + mask_data: np.ndarray, +) -> Tuple[Path, Path]: + """Create a fake S2 L1C granule""" + + profile = { + "width": mask_data.shape[1], + "height": mask_data.shape[2], + "crs": CRS.from_epsg(32643), + "transform": Affine(10.0, 0.0, 399960.0, 0.0, -10.0, 2700000.0), + "driver": "JP2OpenJPEG", + **JPEG2000_NO_COMPRESSION_OPTIONS, + } + + image_path = prefix / "IMG_DATA" / f"T45TXF_20221121T050121_{band}.jp2" + image_path.parent.mkdir(exist_ok=True, parents=True) + with rasterio.open(image_path, "w", count=1, dtype="uint16", **profile) as dst: + dst.write(np.ones(mask_data.shape[1:]), 1) + + mask_path = prefix / "QI_DATA" / f"MSK_QUALIT_{band}.jp2" + mask_path.parent.mkdir(exist_ok=True, parents=True) + with rasterio.open(mask_path, "w", count=8, dtype="uint8", **profile) as dst: + dst.write(mask_data) + + return image_path, mask_path + + +def test_find_image_mask_pairs_found_all(tmp_path: Path): + """Test successfully finding location of all imagery data""" + granule_id = "L1C_T45TXF_A038726_20221121T050115" + granule_prefix = tmp_path / f"{granule_id}.SAFE" / "GRANULE" / granule_id + + mask_data = np.zeros((8, 1, 1), dtype="uint8") + + expected_images_masks = [] + for band in SPECTRAL_BANDS: + expected_images_masks.append(make_fake_s2_granule(granule_prefix, band, mask_data)) + + # Make sure we find all of the expected paths + image_mask_pairs = find_image_mask_pairs(tmp_path, SPECTRAL_BANDS) + + for image_mask_pair in expected_images_masks: + assert image_mask_pair in image_mask_pairs + + +def test_find_image_mask_pairs_found_not_all(tmp_path: Path): + """Test this is a no-op if we can't find all of the mask<>image pairs""" + granule_id = "L1C_T45TXF_A038726_20221121T050115" + granule_prefix = tmp_path / f"{granule_id}.SAFE" / "GRANULE" / granule_id + + mask_data = np.zeros((8, 1, 1), dtype="uint8") + + expected_images_masks = [] + for band in list(SPECTRAL_BANDS)[:2]: + expected_images_masks.append(make_fake_s2_granule(granule_prefix, band, mask_data)) + + # Make sure we find all of the expected paths + image_mask_pairs = find_image_mask_pairs(tmp_path, SPECTRAL_BANDS) + assert not image_mask_pairs + + +def test_apply_quality_mask_overwrites_value(tmp_path: Path): + """Ensure image data are set to no data value (0) per mask band""" + # Mask is 2x2 with 4 test cases, + # * (0, 0) ~> unmasked + # * (0, 1) ~> lost MSI packet + # * (1, 0) ~> degraded MSI packet + # * (1, 1) ~> lost + degraded MSI packet (both set) + mask_data = np.zeros((8, 2, 2), dtype="uint8") + mask_data[2, 0, 1] = 1 + mask_data[3, 1, 0] = 1 + mask_data[2, 1, 1] = 1 + mask_data[3, 1, 1] = 1 + assert mask_data.sum() == 4 + + granule_id = "L1C_T45TXF_A038726_20221121T050115" + granule_prefix = tmp_path / f"{granule_id}.SAFE" / "GRANULE" / granule_id + + image, mask = make_fake_s2_granule(granule_prefix, "B02", mask_data) + + apply_quality_mask(image, mask) + + # Check data ~> image should have 0s in 3 pixel locations + with rasterio.open(image) as src: + image_data = src.read(1) + + np.testing.assert_array_equal( + image_data, + np.array([[1, 0], [0, 0]], dtype="uint16"), + )