Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
109 changes: 103 additions & 6 deletions beast/tools/convert_hdf5_to_fits.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,17 @@
import tables


def st_file(file_name):
def st_file(file_name, autotrim=False):
"""Converts HDF5 files (specifically from Ben Williams) into FITS format.

Parameters
----------
file_name : str (default=None)
Name of HDF5 file (will also be name of output file, with different extension)
Name of HDF5 file (will also be name of output file, with .phot.hdf5
replaced with .st.fits)
autotrim : bool, optional (default=False)
Whether to automatically trim column names that are too long for FITS files,
and will therefore make this function crash otherwise
Comment on lines +14 to +16
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If this function will crash if this variable is false, why not make the default true? And issue a statement of what is happening? This sounds like a better solution than having a known crash.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What this docstring means is that the already-existing st_file function will crash if the column names are too long.

So it shouldn't crash if this is false. Rather, if false, then the function replace_largest_common_substring simply won't get called. That way, the behaviour of all existing code is unchanged. But now this option exists for users to enable if they want.


Returns
-------
Expand Down Expand Up @@ -71,23 +75,116 @@ def st_file(file_name):

# save everything to astropy table in a fits file
print("making astropy table")

new_table = Table()

for tag in set0:
new_table[tag.upper()] = np.array(data_dict[tag])
for tag in set1:
new_table[tag.upper()] = np.array(data_dict[tag])
for tag in set2:
new_table[tag.upper()] = np.array(data_dict[tag])

# Identify and replace too-long column names, if requested
if autotrim:
set0_trim, set1_trim, set2_trim = replace_largest_common_substring(
set0, set1, set2
)
Comment on lines +86 to +90
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I am feeling dim right now. Where are these variables used? I don't see anything before the function ends, so it is not clear to me where the new set0_trim, etc. variables are used to update the column names.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

They're used in lines 91-99, to update the column names with the new, shorter, programmatically-generated column names produced by replace_largest_common_substring (if that's found to be necessary).


# write to file
print("saving to fits file")
new_table.write(
file_name.replace("phot.hdf5", "st.fits"), format="fits", overwrite=True
file_name.replace(".phot.hdf5", ".st.fits"), format="fits", overwrite=True
)

f.close()


def replace_largest_common_substring(set0, set1, set2):
    """Finds the largest contiguous substring common to column names too long
    for FITS key names, and trims it so the names fit.

    Column names longer than 65 characters cannot be written to a FITS
    table.  The longest substring shared by all of the too-long names is
    shortened (preferring to drop "-"/"_" characters; otherwise cutting
    leading characters) by enough to bring every name under the limit.

    Parameters
    ----------
    set0 : list
        set0, as produced by st_file
    set1 : list
        set1, as produced by st_file
    set2 : list
        set2, as produced by st_file

    Returns
    -------
    set0 : list
        set0, with any necessary tag trimming to allow FITS table to write
    set1 : list
        set1, with any necessary tag trimming to allow FITS table to write
    set2 : list
        set2, with any necessary tag trimming to allow FITS table to write

    Raises
    ------
    Exception
        If the too-long column names share no common substring, or the
        shared substring is too short for trimming it to bring the names
        under the limit.
    """

    # Find all column names longer than 65 characters
    sets = set0 + set1 + set2
    indices = np.where(np.array([len(tag) for tag in sets]) > 65)[0]

    # If no column names are too long, return everything unchanged
    if len(indices) == 0:
        return set0, set1, set2

    # Extract too-long column names
    subset = [sets[i] for i in indices]

    # The shared substring can be no longer than the shortest too-long name
    shortest_str = min(subset, key=len)
    max_len = len(shortest_str)

    # Check substrings of decreasing length; break out when longest found
    longest = False
    for length in range(max_len, 0, -1):
        for start in range(0, max_len - length + 1):
            candidate = shortest_str[start : start + length]
            if all(candidate in s for s in subset):
                longest = True
                longest_str = candidate
                break
        if longest:
            break

    # Raise exception if no fix possible
    if not longest:
        raise Exception(
            "Some column names are too long to write to FITS table, "
            + "but no shared string found to trim"
        )

    # Find maximum length of column names, and therefore how much trimming is necessary
    longest_tag = np.array([len(tag) for tag in sets]).max()
    n_char_trim = longest_tag - 65

    # If the shared string is shorter than the number of characters we need
    # to remove, no amount of trimming it can make the names fit.
    # NOTE: the previous check here, (longest_tag - n_char_trim) >= 68, was
    # always comparing 65 >= 68 and so could never trigger, letting
    # still-too-long names through silently.
    if len(longest_str) < n_char_trim:
        raise Exception(
            "No shared string in column names is long enough to enable "
            + "enough trimming to get column names short enough to allow "
            + "FITS table to write without crashing"
        )

    # Decide what the replacement string should be, first checking whether
    # just dropping "-" and "_" characters removes enough
    new_str_test = longest_str.replace("-", "").replace("_", "")
    if (len(longest_str) - len(new_str_test)) >= n_char_trim:
        new_str = new_str_test
    else:
        new_str = longest_str[n_char_trim:]

    # Replace all colnames that contain the long shared string with the
    # trimmed version
    set0_trim, set1_trim, set2_trim = set0.copy(), set1.copy(), set2.copy()
    for i in range(len(set0)):
        set0_trim[i] = set0[i].replace(longest_str, new_str)
    for i in range(len(set1)):
        set1_trim[i] = set1[i].replace(longest_str, new_str)
    for i in range(len(set2)):
        set2_trim[i] = set2[i].replace(longest_str, new_str)

    # Return trimmed column names
    return set0_trim, set1_trim, set2_trim


if __name__ == "__main__":
    import sys

    # The previous hard-coded st_file(file_name=None) always crashed, since
    # st_file calls file_name.replace(...) to build the output name; take
    # the HDF5 file name from the command line instead.
    if len(sys.argv) != 2:
        sys.exit("Usage: python convert_hdf5_to_fits.py <file_name.phot.hdf5>")
    st_file(file_name=sys.argv[1])
Loading