[SPARK-48665][PYTHON][CONNECT] Support providing a dict in pyspark lit to create a map. #49318

Status: Open. Wants to merge 22 commits into base: master (showing changes from 11 commits).
5 changes: 5 additions & 0 deletions python/pyspark/errors/error-conditions.json
@@ -164,6 +164,11 @@
"The value <provider> does not represent a correct collation provider. Supported providers are: [<supportedProviders>]."
]
},
"COLUMN_IN_DICT" : {
"message": [
"`<func_name>` does not allow a Column in a dict."
]
},
"COLUMN_IN_LIST": {
"message": [
"`<func_name>` does not allow a Column in a list."
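For illustration, here is a minimal, hypothetical sketch of how the new error condition surfaces from user code. It assumes an active SparkSession bound to `spark` and is not part of the diff:

    from pyspark.errors import PySparkValueError
    from pyspark.sql import functions as F

    df = spark.range(1)
    try:
        F.lit({"a": df.id})  # a Column as a dict value is rejected
    except PySparkValueError as e:
        print(e)  # e.g. [COLUMN_IN_DICT] `lit` does not allow a Column in a dict.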
13 changes: 12 additions & 1 deletion python/pyspark/sql/connect/functions/builtin.py
@@ -257,8 +257,9 @@ def col(col: str) -> Column:
column = col


def lit(col: Any, to_struct: bool = False) -> Column:
from pyspark.sql.connect.column import Column as ConnectColumn
from itertools import chain

if isinstance(col, Column):
    return col
@@ -276,6 +277,16 @@ def lit(col: Any) -> Column:
messageParameters={"dtype": col.dtype.name},
)
return array(*[lit(c) for c in col]).cast(ArrayType(dt))
elif isinstance(col, dict):
if any(isinstance(c, Column) for c in col.values()):
raise PySparkValueError(
errorClass="COLUMN_IN_DICT", messageParameters={"func_name": "lit"}
)
# Convert to struct or map based on the parameter `to_struct`
if to_struct:
return struct(*[lit(value).alias(key) for key, value in col.items()])
skanderboudawara marked this conversation as resolved.
Show resolved Hide resolved
else:
return create_map(*[lit(x) for x in chain(*col.items())])
return ConnectColumn(LiteralExpression._from_value(col))


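One detail worth calling out in the map branch above: `chain(*col.items())` flattens the dict into the alternating key, value, key, value sequence that `create_map` expects. A quick standalone illustration, in plain Python with no Spark required:

    from itertools import chain

    d = {"a": 1, "b": 2}
    # d.items() yields ('a', 1), ('b', 2); chain(*...) interleaves them
    print(list(chain(*d.items())))  # ['a', 1, 'b', 2]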
50 changes: 46 additions & 4 deletions python/pyspark/sql/functions/builtin.py
@@ -38,6 +38,7 @@
    Union,
    ValuesView,
)
from itertools import chain

from pyspark.errors import PySparkTypeError, PySparkValueError
from pyspark.errors.utils import _with_origin
@@ -153,7 +154,7 @@ def _options_to_str(options: Optional[Mapping[str, Any]] = None) -> Mapping[str,


@_try_remote_functions
def lit(col: Any, to_struct: bool = False) -> Column:
"""
Creates a :class:`~pyspark.sql.Column` of literal value.

@@ -164,12 +165,17 @@ def lit(col: Any) -> Column:

Parameters
----------
col : :class:`~pyspark.sql.Column`, str, int, float, bool, list, NumPy literals, ndarray
    or dict. The value to turn into a PySpark literal. If a column is passed,
    it returns the column as is.

    .. versionchanged:: 3.4.0
        Since 3.4.0, it supports the list type.

    .. versionchanged:: 4.0.0
        Since 4.0.0, it supports the dict type for the creation of a map or a struct.

to_struct : bool, optional, default False
    Only has an effect when ``col`` is a dict. If True, the dict is converted to a
    struct column; if False (the default), it is converted to a map column.

Returns
-------
@@ -253,6 +259,30 @@ def lit(col: Any) -> Column:
+------------------+-------+-----------------+--------------------+
| [true, false]| []| [1.5, 0.1]| [a, b, c]|
+------------------+-------+-----------------+--------------------+

Example 7: Creating a literal column as a map from a dict.

>>> import pyspark.sql.functions as sf
>>> spark.range(1).select(
... sf.lit({"a": 1, "b": 2}).alias("map_col")
... ).show() # doctest: +SKIP
+----------------+
| map_col|
+----------------+
|{a -> 1, b -> 2}|
+----------------+

Example 8: Creating a literal column as a struct from a dict.

>>> import pyspark.sql.functions as sf
>>> spark.range(1).select(
... sf.lit({"a": 1, "b": 2}, true).alias("struct_col")
... ).show() # doctest: +SKIP
+----------+
|struct_col|
+----------+
|    {1, 2}|
+----------+
"""
if isinstance(col, Column):
    return col
@@ -262,6 +292,18 @@ def lit(col: Any) -> Column:
errorClass="COLUMN_IN_LIST", messageParameters={"func_name": "lit"}
)
return array(*[lit(item) for item in col])
elif isinstance(col, dict):
skanderboudawara marked this conversation as resolved.
Show resolved Hide resolved
# Skip checking if the keys are column as Columns are not hashable
# and cannot be used as dict keys in the first place.
if any(isinstance(value, Column) for value in col.values()):
raise PySparkValueError(
errorClass="COLUMN_IN_DICT", messageParameters={"func_name": "lit"}
)
# Convert to struct or map based on the parameter `to_struct`
if to_struct:
return struct(*[lit(value).alias(key) for key, value in col.items()])
else:
return create_map(*[lit(x) for x in chain(*col.items())])
elif _has_numpy:
    if isinstance(col, np.generic):
        dt = _from_numpy_type(col.dtype)
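Putting the two branches together, a hedged usage sketch of the new behavior. It assumes a SparkSession named `spark`; the commented output mirrors Spark's usual map and struct rendering in show():

    from pyspark.sql import functions as F

    df = spark.range(1)
    # Default: a dict literal becomes a map column
    df.select(F.lit({"a": 1, "b": 2}).alias("as_map")).show()
    # |{a -> 1, b -> 2}|
    # With to_struct=True, the keys become struct field names
    df.select(F.lit({"a": 1, "b": 2}, to_struct=True).alias("as_struct")).show()
    # |{1, 2}|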
38 changes: 38 additions & 0 deletions python/pyspark/sql/tests/test_functions.py
@@ -1316,6 +1316,44 @@ def test_lit_list(self):
            messageParameters={"func_name": "lit"},
        )

# SPARK-48665: added support for dict type
def test_lit_dict(self):
test_dict = {"a": 1, "b": 2}
actual = self.spark.range(1).select(F.lit(test_dict, to_struct=True)).first()[0]
# Convert struct return to dict
actual = actual.asDict()
self.assertEqual(actual, test_dict)

test_dict = {"a": 1, "b": 2}
actual = self.spark.range(1).select(F.lit(test_dict)).first()[0]
self.assertEqual(actual, test_dict)

test_dict = {"a": {"1": 1}, "b": {"2": 2}}
actual = self.spark.range(1).select(F.lit(test_dict)).first()[0]
self.assertEqual(actual, test_dict)

with self.sql_conf({"spark.sql.ansi.enabled": False}):
test_dict = {"a": 1, "b": "2", "c": None}
expected_dict = {"a": "1", "b": "2", "c": None}
actual = self.spark.range(1).select(F.lit(test_dict)).first()[0]
self.assertEqual(actual, expected_dict)

df = self.spark.range(10)
dicts = [
{"a": df.id},
{"a": {"b": df.id}},
]

for d in dicts:
with self.assertRaises(PySparkValueError) as pe:
F.lit(d)

self.check_error(
exception=pe.exception,
errorClass="COLUMN_IN_DICT",
messageParameters={"func_name": "lit"},
)

# Test added for SPARK-39832; change Python API to accept both col & str as input
def test_regexp_replace(self):
    df = self.spark.createDataFrame(
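Note that the tests only place Columns in dict values. Keys need no corresponding check because, as the inline comment in the implementation says, Columns are not hashable and so can never be dict keys. A short sketch of that failure mode, assuming `spark` is available:

    df = spark.range(1)
    try:
        bad = {df.id: 1}  # building the dict itself fails
    except TypeError as e:
        print(e)  # e.g. unhashable type: 'Column'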