Skip to content

Commit

Permalink
Merge pull request #647 from wrangleworks/Split-Wildcard-Bug
Browse files Browse the repository at this point in the history
Split wildcard bug
  • Loading branch information
ChrisWRWX authored Jan 22, 2025
2 parents f2449b6 + e351b40 commit 6c2735a
Show file tree
Hide file tree
Showing 2 changed files with 75 additions and 22 deletions.
54 changes: 54 additions & 0 deletions tests/recipes/wrangles/test_split.py
Original file line number Diff line number Diff line change
Expand Up @@ -552,6 +552,25 @@ def test_split_text_regex_case_insensitive(self):
df['col1'][1] == ["1","2"]
)

def test_split_text_wildcard_list_output(self):
    """
    Test split.text when the output is a single-element list
    containing a wildcard (``- out*``).

    The input sentence has 8 space-separated words, so the wildcard
    should expand to out1..out8, giving 9 columns in total together
    with the original col1.
    """
    df = wrangles.recipe.run(
        r"""
        wrangles:
          - split.text:
              input: col1
              output:
                - out*
              char: ' '
        """,
        dataframe=pd.DataFrame({
            'col1': ['This is a string that will be split']
        })
    )
    # Checked separately so a failure shows which expectation broke
    assert len(df.columns.to_list()) == 9
    assert df.iloc[0]['out4'] == 'string'


class TestSplitList:
"""
Expand Down Expand Up @@ -630,6 +649,41 @@ def test_split_list_where(self):
df['Col2'].to_list() == ['', 'Mundo!', 'Monde!']
)

def test_split_list_wildcard(self):
    """
    Test the split.list function using a wildcard output
    given as a plain string (``Col*``).

    The input list has 4 elements, so the wildcard should expand to
    Col1..Col4, giving 5 columns in total together with the original Col.
    """
    df = wrangles.recipe.run(
        """
        wrangles:
          - split.list:
              input: Col
              output: Col*
        """,
        dataframe=pd.DataFrame({
            'Col': [["Hello", "Wrangles!", "and", "World!"]]
        })
    )
    # Checked separately so a failure shows which expectation broke
    assert len(df.columns.to_list()) == 5
    assert df['Col2'][0] == 'Wrangles!'

def test_split_list_wildcard_list(self):
    """
    Test the split.list function using a wildcard output
    given as a single-element list (``- Col*``).

    The input list has 4 elements, so the wildcard should expand to
    Col1..Col4, giving 5 columns in total together with the original Col.
    """
    df = wrangles.recipe.run(
        """
        wrangles:
          - split.list:
              input: Col
              output:
                - Col*
        """,
        dataframe=pd.DataFrame({
            'Col': [["Hello", "Wrangles!", "and", "World!"]]
        })
    )
    # Checked separately so a failure shows which expectation broke
    assert len(df.columns.to_list()) == 5
    assert df['Col2'][0] == 'Wrangles!'


class TestSplitDictionary:
Expand Down
43 changes: 21 additions & 22 deletions wrangles/recipe_wrangles/split.py
Original file line number Diff line number Diff line change
Expand Up @@ -139,16 +139,14 @@ def list(df: _pd.DataFrame, input: str, output: _Union[str, _list]) -> _pd.DataF
for x in results
]

if isinstance(output, str) and '*' in output:
# If user has provided a wildcard for the column name
# then use that with an incrementing index
output_headers = []
for i in range(1, len(results[0]) + 1):
output_headers.append(output.replace('*', str(i)))
# Handle wildcard cases and column assignment
if (isinstance(output, str) and '*' in output) or (isinstance(output, _list) and len(output) == 1 and '*' in output[0]):
# Use the wildcard pattern for generating output headers
wildcard_template = output if isinstance(output, str) else output[0]
output_headers = [wildcard_template.replace('*', str(i)) for i in range(1, len(results[0]) + 1)]
df[output_headers] = results

else:
# Else they should have provided a list for all the output columns
# Direct assignment for single column
df[output] = results

return df
Expand Down Expand Up @@ -230,26 +228,27 @@ def text(
else:
pad = False

# Determine the output_length based on the type and length of output
# output_length = len(output) if isinstance(output, _list) and len(output) > 1 else None

# Perform the split operation
results = _format.split(
df[input].astype(str).values,
output_length = len(output) if isinstance(output, _list) else None,
split_char = char,
pad = pad,
inclusive = inclusive,
element = element
output_length=len(output) if isinstance(output, _list) and len(output) > 1 else None,
split_char=char,
pad=pad,
inclusive=inclusive,
element=element
)

if isinstance(output, str) and '*' in output:
# If user has entered a wildcard in the output column name
# then add results to that with an incrementing index for each column
# column * -> column 1, column 2, column 3...
output_headers = []
for i in range(1, len(results[0]) + 1):
output_headers.append(output.replace('*', str(i)))
# Handle wildcard cases and column assignment
if (isinstance(output, str) and '*' in output) or (isinstance(output, _list) and len(output) == 1 and '*' in output[0]):
# Use the wildcard pattern for generating output headers
wildcard_template = output if isinstance(output, str) else output[0]
output_headers = [wildcard_template.replace('*', str(i)) for i in range(1, len(results[0]) + 1)]
df[output_headers] = results

else:
# User has given a single column - return as a list within that column
# Direct assignment for single column
df[output] = results

return df
Expand Down

0 comments on commit 6c2735a

Please sign in to comment.