Skip to content
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -139,4 +139,7 @@ dmypy.json
examples/evaluations/text_generation/final_metrics.txt

#Data generated by langfair data_loader module
langfair/data/*
langfair/data/*

# Personal testing notebooks
TESTINGPURU.ipynb
3 changes: 2 additions & 1 deletion langfair/auto/auto.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@
# Maps each supported protected attribute to its group labels. Iterated by the
# auto-evaluation FTU check (keys seed the per-attribute word counters).
Protected_Attributes = {
    "race": ["white", "black", "asian", "hispanic"],
    "gender": ["male", "female"],
    "sexual_orientation": ["heterosexual", "gay", "lesbian", "bisexual"],
}


Expand Down Expand Up @@ -179,7 +180,7 @@ async def evaluate(
print("------------------------------------------")
# 1. Check for Fairness Through Unawareness FTU
# Parse prompts for protected attribute words
protected_words = {"race": 0, "gender": 0}
protected_words = {attr: 0 for attr in Protected_Attributes}
total_protected_words = 0

for attribute in protected_words.keys():
Expand Down
28 changes: 28 additions & 0 deletions langfair/constants/word_lists.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
This file includes word lists for the following 5 categories:
- Race (Asian, Hispanic, and White): Most common names for each race (Garg et al. 2018)
- Gender (Female, Male): Gender specific words - such as woman, man etc. - for each gender (Bolukbasi et al. 2016)
- Sexual Orientation (Gay, Straight, Bisexual, etc.): Terms influenced from HRC Glossary of Terms (https://www.hrc.org/resources/glossary-of-terms)
- Adjectives: List of adjectives (Garg et al. 2018)
- Professions: List of professions (Bolukbasi et al. 2016)

Expand Down Expand Up @@ -173,6 +174,33 @@
]


################################################################################
# Define sexual orientation word lists
################################################################################
# For sexual orientation, string search is done (similar to race).
# These terms reference sexual orientation unambiguously on their own, so a
# plain case-insensitive substring match is sufficient — no following person
# word is required. Plural forms are listed explicitly so that longest-match
# replacement handles them before their singular substrings.
SEXUAL_ORIENTATION_WORDS_NOT_REQUIRING_CONTEXT: List[str] = [
    "homosexual",
    "heterosexual",
    "bisexual",
    "lesbian",
    "queer",
    "lgbtq",
    "lgbt",
    "homosexuals",
    "heterosexuals",
    "bisexuals",
    "lesbians",
    "nonheterosexual",
    "non-heterosexual",
]

# Terms that indicate sexual orientation only when immediately followed by a
# person word (e.g. "gay man", "straight employee"); they are matched as
# "<term> <person word>" token-pairs to avoid false positives from unrelated
# senses of these words, mirroring RACE_WORDS_REQUIRING_CONTEXT.
# NOTE: "pride" was removed during review — paired with PERSON_WORDS it only
# produced unnatural token-pairs ("pride accountant") that match no real text.
SEXUAL_ORIENTATION_WORDS_REQUIRING_CONTEXT: List[str] = [
    "gay",
    "straight",
]


# Stereotype groups
STEREOTYPE_GROUP_NUMBER_MAPPING = {
"Homosexuals": "The homosexual",
Expand Down
125 changes: 101 additions & 24 deletions langfair/generator/counterfactual.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,8 @@
PERSON_WORDS,
RACE_WORDS_NOT_REQUIRING_CONTEXT,
RACE_WORDS_REQUIRING_CONTEXT,
SEXUAL_ORIENTATION_WORDS_NOT_REQUIRING_CONTEXT,
SEXUAL_ORIENTATION_WORDS_REQUIRING_CONTEXT,
)
from langfair.generator.generator import ResponseGenerator
from langfair.utils.display import (
Expand Down Expand Up @@ -60,6 +62,22 @@
) # Extend to include words that indicate race whether or not a person word follows
STRICT_RACE_WORDS = list(set(STRICT_RACE_WORDS))
ALL_RACE_WORDS = RACE_WORDS_REQUIRING_CONTEXT + RACE_WORDS_NOT_REQUIRING_CONTEXT

# Token-pairs of the form "<orientation word> <person word>" unambiguously
# reference a person's sexual orientation, so they qualify for strict matching.
_orientation_person_pairs = [
    f"{sow} {pw}"
    for sow in SEXUAL_ORIENTATION_WORDS_REQUIRING_CONTEXT
    for pw in PERSON_WORDS
]
# Add words that indicate sexual orientation whether or not a person word
# follows, then deduplicate.
STRICT_SEXUAL_ORIENTATION_WORDS = list(
    set(_orientation_person_pairs + SEXUAL_ORIENTATION_WORDS_NOT_REQUIRING_CONTEXT)
)
# Every sexual orientation word, context-requiring or not, used for parsing.
ALL_SEXUAL_ORIENTATION_WORDS = (
    SEXUAL_ORIENTATION_WORDS_REQUIRING_CONTEXT
    + SEXUAL_ORIENTATION_WORDS_NOT_REQUIRING_CONTEXT
)
warnings.filterwarnings("ignore", category=DeprecationWarning)


Expand Down Expand Up @@ -105,17 +123,20 @@ def __init__(
self.attribute_to_word_lists = {
"race": ALL_RACE_WORDS,
"gender": ALL_GENDER_WORDS,
"sexual_orientation": ALL_SEXUAL_ORIENTATION_WORDS,
}
self.attribute_to_ref_dicts = {"gender": GENDER_TO_WORD_LISTS}
self.gender_to_word_lists = GENDER_TO_WORD_LISTS
self.cf_gender_mapping = GENDER_MAPPING
self.gender_neutral_mapping = GENDER_NEUTRAL_MAPPING
self.all_race_words = ALL_RACE_WORDS
self.strict_race_words = STRICT_RACE_WORDS
self.strict_sexual_orientation_words = STRICT_SEXUAL_ORIENTATION_WORDS
self.detokenizer = sacremoses.MosesDetokenizer("en")
self.group_mapping = {
"gender": ["male", "female"],
"race": ["white", "black", "hispanic", "asian"],
"sexual_orientation": ["heterosexual", "gay", "lesbian", "bisexual"],
}

try:
Expand Down Expand Up @@ -145,7 +166,7 @@ async def estimate_token_cost(
tiktoken_model_name: str
The name of the OpenAI model to use for token counting.

attribute: str, either 'gender' or 'race'
attribute: str, either 'gender', 'race', or 'sexual_orientation'
Specifies attribute to be used for counterfactual generation

example_responses : list of strings, default=None
Expand Down Expand Up @@ -196,9 +217,9 @@ def parse_texts(
texts : list of strings
A list of texts to be parsed for protected attribute words

attribute : {'race','gender'}, default=None
Specifies what to parse for among race words and gender words. Must be specified
if custom_list is None
attribute : {'race','gender','sexual_orientation'}, default=None
Specifies what to parse for among race words, gender words, and sexual orientation
words. Must be specified if custom_list is None

custom_list : List[str], default=None
Custom list of tokens to use for parsing prompts. Must be provided if attribute is None.
Expand Down Expand Up @@ -233,9 +254,9 @@ def create_prompts(
prompts : List[str]
A list of prompts on which counterfactual substitution and response generation will be done

attribute : {'gender', 'race'}, default=None
Specifies whether to use race or gender for counterfactual substitution. Must be provided if
custom_dict is None.
attribute : {'gender', 'race', 'sexual_orientation'}, default=None
Specifies whether to use race, gender, or sexual orientation for counterfactual
substitution. Must be provided if custom_dict is None.

custom_dict : Dict[str, List[str]], default=None
A dictionary containing corresponding lists of tokens for counterfactual substitution. Keys
Expand Down Expand Up @@ -267,6 +288,14 @@ def create_prompts(
for race in self.group_mapping[attribute]
}

elif attribute == "sexual_orientation":
prompts_dict = {
orientation + "_prompt": self._counterfactual_sub_sexual_orientation(
texts=prompts, target_orientation=orientation
)
for orientation in self.group_mapping[attribute]
}

else:
if custom_dict:
ref_dict = custom_dict
Expand All @@ -292,30 +321,36 @@ def neutralize_tokens(
self, texts: List[str], attribute: str = "gender"
) -> List[str]:
"""
Neutralize gender and race words contained in a list of texts. Replaces gender words with a
gender-neutral equivalent and race words with "[MASK]".
Neutralize gender, race, and sexual orientation words contained in a list of texts.
Replaces gender words with a gender-neutral equivalent and race or sexual orientation
words with "[MASK]".

Parameters
----------
texts : List[str]
A list of texts on which gender or race neutralization will occur
A list of texts on which gender, race, or sexual orientation neutralization will occur

attribute : {'gender', 'race'}, default='gender'
Specifies whether to use race or gender for neutralization
attribute : {'gender', 'race', 'sexual_orientation'}, default='gender'
Specifies whether to use race, gender, or sexual orientation for neutralization

Returns
-------
list
List of texts neutralized for race or gender
List of texts neutralized for race, gender, or sexual orientation
"""
assert attribute in [
"gender",
"race",
], "Only gender and race attributes are supported."
"sexual_orientation",
], "Only gender, race, and sexual_orientation attributes are supported."
if attribute == "gender":
return [self._neutralize_gender(text) for text in texts]
elif attribute == "race":
return self._counterfactual_sub_race(texts=texts, target_race="[MASK]")
elif attribute == "sexual_orientation":
return self._counterfactual_sub_sexual_orientation(
texts=texts, target_orientation="[MASK]"
)

async def generate_responses(
self,
Expand All @@ -335,9 +370,9 @@ async def generate_responses(
prompts : list of strings
A list of prompts on which counterfactual substitution and response generation will be done

attribute : {'gender', 'race'}, default=None
Specifies whether to use race or gender for counterfactual substitution. Must be provided if
custom_dict is None.
attribute : {'gender', 'race', 'sexual_orientation'}, default=None
Specifies whether to use race, gender, or sexual orientation for counterfactual
substitution. Must be provided if custom_dict is None.

custom_dict : Dict[str, List[str]], default=None
A dictionary containing corresponding lists of tokens for counterfactual substitution. Keys
Expand Down Expand Up @@ -457,9 +492,9 @@ def check_ftu(
prompts : list of strings
A list of prompts to be parsed for protected attribute words

attribute : {'race','gender'}, default=None
Specifies what to parse for among race words and gender words. Must be specified
if custom_list is None
attribute : {'race','gender','sexual_orientation'}, default=None
Specifies what to parse for among race words, gender words, and sexual orientation
words. Must be specified if custom_list is None

custom_list : List[str], default=None
Custom list of tokens to use for parsing prompts. Must be provided if attribute is None.
Expand Down Expand Up @@ -599,6 +634,8 @@ def _token_parser(
return self._get_race_subsequences(text)
elif attribute == "gender":
return list(set(tokens) & set(self.attribute_to_word_lists[attribute]))
elif attribute == "sexual_orientation":
return self._get_sexual_orientation_subsequences(text)
elif custom_list:
return list(set(tokens) & set(custom_list))

Expand Down Expand Up @@ -673,6 +710,42 @@ def _replace_race(text: str, target_race: str) -> str:
seq = seq.replace(subseq, race_replacement_mapping[subseq])
return seq

@staticmethod
def _get_sexual_orientation_subsequences(text: str) -> List[str]:
    """Return every strict sexual orientation token found in ``text``.

    Matching is case-insensitive substring search against
    STRICT_SEXUAL_ORIENTATION_WORDS, mirroring the race subsequence check.
    """
    lowered = text.lower()
    found = []
    for candidate in STRICT_SEXUAL_ORIENTATION_WORDS:
        if candidate in lowered:
            found.append(candidate)
    return found

def _counterfactual_sub_sexual_orientation(
self,
texts: List[str],
target_orientation: str,
) -> List[str]:
"""Implements counterfactual substitution for sexual orientation"""
new_texts = []
for text in texts:
new_text = self._replace_sexual_orientation(text, target_orientation)
new_texts.append(new_text)
return new_texts

@staticmethod
def _replace_sexual_orientation(text: str, target_orientation: str) -> str:
    """Replace sexual orientation words in ``text`` with ``target_orientation``.

    The text is lowercased first. Context-requiring words are replaced only as
    "<word> <person word>" token-pairs (the person word is kept); context-free
    words are replaced wherever they appear.
    """
    lowered = text.lower()
    # Map each strict match key to its substituted form.
    substitutions = {
        f"{sow} {pw}": f"{target_orientation} {pw}"
        for sow in SEXUAL_ORIENTATION_WORDS_REQUIRING_CONTEXT
        for pw in PERSON_WORDS
    }
    for sow in SEXUAL_ORIENTATION_WORDS_NOT_REQUIRING_CONTEXT:
        substitutions[sow] = target_orientation
    # Replace longest keys first so partial overlaps (e.g. "lgbtq" vs "lgbt")
    # are never clobbered by a shorter substring replacement.
    for key in sorted(STRICT_SEXUAL_ORIENTATION_WORDS, key=len, reverse=True):
        if key in lowered:
            lowered = lowered.replace(key, substitutions[key])
    return lowered

@staticmethod
def _validate_attributes(
attribute: Optional[str] = None,
Expand All @@ -683,14 +756,18 @@ def _validate_attributes(
if for_parsing:
if custom_list and attribute:
raise ValueError("Either custom_list or attribute must be None.")
if not (custom_list or attribute in ["race", "gender"]):
if not (
custom_list or attribute in ["race", "gender", "sexual_orientation"]
):
raise ValueError(
"If custom_list is None, attribute must be 'race' or 'gender'."
"If custom_list is None, attribute must be 'race', 'gender', or 'sexual_orientation'."
)
else:
if custom_dict and attribute:
raise ValueError("Either custom_dict or attribute must be None.")
if not (custom_dict or attribute in ["race", "gender"]):
if not (
custom_dict or attribute in ["race", "gender", "sexual_orientation"]
):
raise ValueError(
"If custom_dict is None, attribute must be 'race' or 'gender'."
"If custom_dict is None, attribute must be 'race', 'gender', or 'sexual_orientation'."
)
Loading