Merged
7 changes: 7 additions & 0 deletions spectral_cubes/README.md
@@ -0,0 +1,7 @@
# Analytical data search in the cloud: finding jets in JWST spectral cubes

Not all archival search queries can be answered with metadata alone. In this use case, we start from an intimidating user query: "I want to find JWST spectral image cubes containing Fe II emission from jets launched by young stellar objects." In the course of answering this query, we learn to think carefully about the order of operations for coordinate searches on large numbers of targets, to perform parts of the search locally when there are too many targets for archive services to handle, to parallelize our data processing, to load data directly into memory rather than onto disk, and to load data from the cloud instead of from on-premise storage.

Although this tutorial focuses on JWST spectral cubes, it is intended to teach general strategies that will be useful for answering other data-intensive search queries unrelated to spectral cubes, in any of the Fornax archives. We design some simple algorithms to search for extended emission in spectral cubes, but these algorithms are designed to be replaceable by more sophisticated tools tailored to your science case.

Note: the notebook (emission_search_cubes.md) requires the files in the code_src/ directory in order to run.
159 changes: 159 additions & 0 deletions spectral_cubes/code_src/find_extended_emission.py
@@ -0,0 +1,159 @@
import numpy as np
from scipy.ndimage import label
from scipy.signal import find_peaks
import pandas as pd

def extension_vs_wavebin(data):
"""
Generate a 1D list of candidate emission spatial extents for each
wavelength bin of a given spectral cube. Replace the contents of
this function with your favorite image segmentation algorithm. The desired
output is any 1D Python list in which a peak corresponds to an
increased chance of a jet being present at the corresponding wavebin.

Parameters
----------
data : np.ndarray
A spectral cube as a numpy.ndarray,
with first axis wavelength,
second and third axes spatial.

Returns
-------
blobs : list[float]
List of length equal to the number of wavebins, with values
corresponding to the fraction of pixels from that slice above
a brightness threshold.

"""

# Let's instantiate a list "blobs".
# This will contain a list of the sizes (as a fraction of non-NaN pixels in each slice)
# of contiguous regions ("blobs") of pixels brighter than some threshold in each slice.
blobs = []

for i in range(0, data.shape[0]): # For every image slice in the spectral cube:

# Grab the image for this slice.
dataslice = data[i]

# Set a threshold flux value above which a pixel is considered to not be part of the background.
# Here, we base this threshold on the median brightness of all pixels within +/- 10 slices of the current slice.
# This is pretty arbitrary: demonstration purposes only!
start = max(0, i-10)
end = min(data.shape[0], i+11) # i+11 so that the slice includes bin i+10

# Check for the case where all data is NaN
if np.all(np.isnan(data[start:end])):
blobs.append(0.)
continue # Skip the rest of this loop

threshold = 3.0*np.nanmedian(data[start:end])

# Define a mask for this slice,
# where a pixel has a value of True if it exceeds the threshold,
# and a value of False if it does not.
bright_pixels = dataslice > threshold

# Use scipy.ndimage.label to define contiguous regions (blobs) of these bright pixels.
labeled_array, num_features = label(bright_pixels)

# In labeled_array, pixels with a value of 0 did not meet the bright_pixels condition;
# these are background pixels.
# Any other labels (1, 2, 3, and so on) represent different contiguous regions of pixels
# meeting the bright_pixels condition. These are our blobs.

# Get the number of pixels in each blob, in order of label (0, 1, 2, 3...).
sizes = np.bincount(labeled_array.ravel())

# If all the pixels are labeled 0, then there is no candidate jet feature in this slice.
# So we'll assign 0 to the size of the jet feature candidate in this slice.
if len(sizes) <= 1:
blobs.append(0.)

# Otherwise, there's more than one label...
else:

#...the background pixels (label 0) are first in the list we made with np.bincount.
# The rest are blobs.
# Let's get the size of the biggest blob: this is probably our best jet candidate.
jet_size = np.max(sizes[1:])

# Get the number of pixels in the slice that are not NaN.
num_goodpixels = np.count_nonzero(~np.isnan(dataslice))

# What fraction of the non-NaN pixels are occupied by the jet candidate?
jet_fraction = jet_size / num_goodpixels

# To improve the likelihood that this is a jet,
# let's require that the jet candidate be at least 10 pixels in size,
# but occupy less than half of the slice.
if (jet_size >= 10) and (jet_fraction < 0.5):
blobs.append(jet_fraction) # Add the jet candidate size fraction for this slice to the list.
else: # Otherwise, assume that we don't have a good jet candidate.
blobs.append(0.)

return blobs
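The per-slice logic above can be exercised on a tiny synthetic image slice (illustration only; the array, threshold, and blob here are made up, not real JWST data):

```python
import numpy as np
from scipy.ndimage import label

# A 6x6 synthetic image slice: faint background with one bright 3x2 blob.
dataslice = np.full((6, 6), 0.1)
dataslice[1:4, 2:4] = 5.0

# Same steps as in the loop above: threshold, label, measure the biggest blob.
threshold = 3.0 * np.nanmedian(dataslice)
bright_pixels = dataslice > threshold
labeled_array, num_features = label(bright_pixels)

sizes = np.bincount(labeled_array.ravel())  # [n_background, n_blob1, ...]
jet_size = np.max(sizes[1:])                # size of the biggest blob
jet_fraction = jet_size / np.count_nonzero(~np.isnan(dataslice))
```

Here the single 6-pixel blob occupies 6/36 of the slice; with the demonstration cuts above it would still fail the `jet_size >= 10` requirement, which is one more reminder that these thresholds are tunable placeholders.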


def detect_spikes(blobs, pos, sig=3.0, detect_halfwidth=10, match_halfwidth=2):
"""
Detect spikes in the array of candidate jet sizes (or probability, etc.) vs. wavelength.
Replace the contents of this function with your favorite outlier detection algorithm.

Parameters
----------
blobs : list[float]
The 1D python list returned by extension_vs_wavebin.
List of length equal to the number of wavebins, with values
corresponding to the fraction of pixels from that slice above
a brightness threshold.
pos : int
The expected wavebin index of a particular spectral line.
sig : float
Scaling factor used to set the detection threshold.
detect_halfwidth : int
The number of pixels +/- offset from pos to run peak detector on.
match_halfwidth : int
The max number of pixels away from pos that a peak can be centered on
and still have this function yield a detection at pos.

Returns
-------
bool
True (if spike is detected near pos) or False (if not)
"""

# Excerpt a range of +/- detect_halfwidth wavebins around the expected position of the Fe II line.
offset = detect_halfwidth
start = max(0, pos - offset)
end = min(len(blobs), pos + offset + 1)
excerpt = np.asarray(blobs[start:end])

# Check for the case where all non-zero excerpted data is NaN
if np.all(np.isnan(excerpt[excerpt!=0])):
return False

# Set a detection threshold equal to the standard deviation of this excerpt,
# multiplied by a scaling factor.
threshold = np.nanstd(excerpt[excerpt!=0])*sig

# Find peaks with scipy.signal.find_peaks
peaks, properties = find_peaks(excerpt, prominence=threshold)

# If any of the detected peaks are within a couple pixels of the expected position of the Fe II line,
# then return True.
# Otherwise, return False.
return any((pos - match_halfwidth <= peak + start <= pos + match_halfwidth) for peak in peaks)
# Note: peak indices refer to the excerpt, so we shift by start (not pos-offset,
# which is wrong whenever the excerpt is clipped at the left edge of the array).
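The same excerpt-and-match logic can be sketched standalone on a synthetic blobs curve (made-up numbers, for illustration only):

```python
import numpy as np
from scipy.signal import find_peaks

# Synthetic curve of blob fractions vs. wavebin: flat background
# with a single spike at the expected line position.
blobs = np.full(25, 0.02)
blobs[12] = 0.2

pos, detect_halfwidth, match_halfwidth, sig = 12, 10, 2, 3.0
start = max(0, pos - detect_halfwidth)
end = min(len(blobs), pos + detect_halfwidth + 1)
excerpt = np.asarray(blobs[start:end])

# Threshold on the scatter of the non-zero excerpted values.
threshold = np.nanstd(excerpt[excerpt != 0]) * sig
peaks, properties = find_peaks(excerpt, prominence=threshold)

# Map peak indices back to the full array before comparing with pos.
detected = any(abs(peak + start - pos) <= match_halfwidth for peak in peaks)
```

The spike at wavebin 12 lands at index 10 of the excerpt, so mapping back with `peak + start` is what makes the match against `pos` work.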


def not_none(x):
"""Check that x is not None and is not pd.NA"""
try:
return x is not None and not pd.isna(x)
except (TypeError, ValueError):
# For list-like types, pd.isna returns an array rather than a single bool
return x is not None
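The try/except exists because pd.isna returns a single bool for scalars but an elementwise array for list-likes; a quick standalone illustration:

```python
import pandas as pd

# Scalars: pd.isna returns a single bool.
scalar_checks = [pd.isna(None), pd.isna(pd.NA), pd.isna(3.14)]

# List-likes: pd.isna returns an elementwise array, which cannot be
# reduced to a single truth value with `not`.
try:
    result = not pd.isna([1, 2])
except (TypeError, ValueError):
    result = "ambiguous"
```
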
41 changes: 41 additions & 0 deletions spectral_cubes/code_src/parse_polygon.py
@@ -0,0 +1,41 @@
import numpy as np

def parse_polygon(s_region):
"""
Extract coordinates from an s_region string.

Parameters
----------
s_region : string - s_region from cross-mission archive
As written, this s_region must be a single polygon,
which is the case for all JWST and SOFIA observations,
but not all missions in the Fornax archives.
For example:
POLYGON 11.885732601 -25.291345071 11.885732601
-25.289267294 11.888030654 -25.289267294 11.888030654
-25.291345071
(or "POLYGON ICRS")

Returns
-------
coord_array : np.ndarray
Parsed representation of the s_region. Last vertex is identical
to first vertex.
"""

if 'POLYGON ICRS' in s_region:
coords = list(map(float, s_region.replace("POLYGON ICRS", "").strip().split()))
elif 'POLYGON' in s_region:
coords = list(map(float, s_region.replace("POLYGON", "").strip().split()))
else:
raise ValueError(f"s_region is not a single polygon: {s_region}")

# Check if the polygon is closed, with the last vertex identical to the first vertex.
# If not, append a copy of the first vertex to the end of the list.
if (coords[-2], coords[-1]) != (coords[0], coords[1]):
coords.append(coords[0])
coords.append(coords[1])

# Create a numpy array listing a coordinate tuple for each vertex
coord_array = np.array([(coords[i], coords[i + 1]) for i in range(0, len(coords), 2)])

return coord_array
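The example s_region from the docstring can be run through the same parsing steps by hand (a standalone sketch mirroring the function body):

```python
import numpy as np

# The (open) example polygon from the docstring: 4 vertices,
# with the last vertex not repeated.
s_region = ("POLYGON ICRS 11.885732601 -25.291345071 "
            "11.885732601 -25.289267294 11.888030654 -25.289267294 "
            "11.888030654 -25.291345071")

coords = list(map(float, s_region.replace("POLYGON ICRS", "").strip().split()))

# Close the polygon if the last vertex differs from the first.
if (coords[-2], coords[-1]) != (coords[0], coords[1]):
    coords += coords[:2]

coord_array = np.array([(coords[i], coords[i + 1])
                        for i in range(0, len(coords), 2)])
```

The 4 open vertices become 5 rows, with the first vertex duplicated at the end, matching the closed-polygon contract stated in the docstring.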
