Merge pull request #647 from wrangleworks/Split-Wildcard-Bug

Split wildcard bug
wrangleworks · Jan 22, 2025 · 6c2735a · 6c2735a
2 parents f2449b6 + e351b40
commit 6c2735a
Show file tree

Hide file tree

Showing 2 changed files with 75 additions and 22 deletions.
diff --git a/tests/recipes/wrangles/test_split.py b/tests/recipes/wrangles/test_split.py
@@ -552,6 +552,25 @@ def test_split_text_regex_case_insensitive(self):
             df['col1'][1] == ["1","2"]
         )
 
+    def test_split_text_wildcard_list_output(self):
+        """
+        Test split.text where output is a single-item list containing a wildcard
+        """
+        df = wrangles.recipe.run(
+            r"""
+            wrangles:
+                - split.text:
+                    input: col1
+                    output: 
+                      - out*
+                    char: ' '
+            """,
+            dataframe=pd.DataFrame({
+                'col1': ['This is a string that will be split']
+            })
+        )
+        assert len(df.columns.to_list()) == 9 and df.iloc[0]['out4'] == 'string'
+
 
 class TestSplitList:
     """
@@ -630,6 +649,41 @@ def test_split_list_where(self):
             df['Col2'].to_list() == ['', 'Mundo!', 'Monde!']
         )
 
+    def test_split_list_wildcard(self):
+        """
+        Test the split.list function using a wildcard output
+        """
+        df = wrangles.recipe.run(
+            """
+            wrangles:
+                - split.list:
+                    input: Col
+                    output: Col*
+            """,
+            dataframe=pd.DataFrame({
+                'Col': [["Hello", "Wrangles!", "and", "World!"]]
+            })
+        )
+        assert len(df.columns.to_list()) == 5 and df['Col2'][0] == 'Wrangles!'
+
+    def test_split_list_wildcard_list(self):
+        """
+        Test the split.list function using a
+        wildcard output as a list
+        """
+        df = wrangles.recipe.run(
+            """
+            wrangles:
+                - split.list:
+                    input: Col
+                    output: 
+                      - Col*
+            """,
+            dataframe=pd.DataFrame({
+                'Col': [["Hello", "Wrangles!", "and", "World!"]]
+            })
+        )
+        assert len(df.columns.to_list()) == 5 and df['Col2'][0] == 'Wrangles!'
 
 
 class TestSplitDictionary:

diff --git a/wrangles/recipe_wrangles/split.py b/wrangles/recipe_wrangles/split.py
@@ -139,16 +139,14 @@ def list(df: _pd.DataFrame, input: str, output: _Union[str, _list]) -> _pd.DataF
         for x in results
     ]
 
-    if isinstance(output, str) and '*' in output:
-        # If user has provided a wildcard for the column name
-        # then use that with an incrementing index
-        output_headers = []
-        for i in range(1, len(results[0]) + 1):
-            output_headers.append(output.replace('*', str(i)))
+    # Handle wildcard cases and column assignment
+    if (isinstance(output, str) and '*' in output) or (isinstance(output, _list) and len(output) == 1 and '*' in output[0]):
+        # Expand the wildcard into numbered headers: Col* -> Col1, Col2, Col3...
+        wildcard_template = output if isinstance(output, str) else output[0]
+        output_headers = [wildcard_template.replace('*', str(i)) for i in range(1, len(results[0]) + 1)]
         df[output_headers] = results
-
     else:
-        # Else they should have provided a list for all the output columns
+        # No wildcard: assign the results directly to the named output column(s)
         df[output] = results
 
     return df
@@ -230,26 +228,27 @@ def text(
         else:
             pad = False
 
+    # Only pass a fixed output_length when several output columns are named explicitly;
+    # a string or single-item list (possibly a wildcard) passes output_length=None.
+
+    # Perform the split operation
     results = _format.split(
         df[input].astype(str).values,
-        output_length = len(output) if isinstance(output, _list) else None,
-        split_char = char,
-        pad = pad,
-        inclusive = inclusive,
-        element = element
+        output_length=len(output) if isinstance(output, _list) and len(output) > 1 else None,
+        split_char=char,
+        pad=pad,
+        inclusive=inclusive,
+        element=element
     )
 
-    if isinstance(output, str) and '*' in output:
-        # If user has entered a wildcard in the output column name
-        # then add results to that with an incrementing index for each column
-        # column * -> column 1, column 2, column 3...
-        output_headers = []
-        for i in range(1, len(results[0]) + 1):
-            output_headers.append(output.replace('*', str(i)))
+    # Handle wildcard cases and column assignment
+    if (isinstance(output, str) and '*' in output) or (isinstance(output, _list) and len(output) == 1 and '*' in output[0]):
+        # Expand the wildcard into numbered headers: column * -> column 1, column 2, column 3...
+        wildcard_template = output if isinstance(output, str) else output[0]
+        output_headers = [wildcard_template.replace('*', str(i)) for i in range(1, len(results[0]) + 1)]
         df[output_headers] = results
-
     else:
-        # User has given a single column - return as a list within that column
+        # No wildcard: assign the results directly to the given output column(s)
         df[output] = results
 
     return df