[SPARK-48665][PYTHON][CONNECT] Support providing a dict in pyspark lit to create a map. #49318

Open · wants to merge 22 commits into base: master
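For orientation before the diff: this change teaches `pyspark.sql.functions.lit` (and its Spark Connect counterpart) to accept a plain Python `dict`, producing either a map or a struct column depending on `spark.sql.pyspark.inferNestedDictAsStruct.enabled`. A minimal sketch of the intended behaviour, inferred from the diff below (outputs are illustrative; also note the new `get_conf` helper is `lru_cache`d, so toggling the conf mid-session may require a cache clear, as the tests do):

```python
from pyspark.sql import SparkSession, functions as sf

spark = SparkSession.builder.getOrCreate()

# Default behaviour: a dict literal becomes a MapType column.
spark.conf.set("spark.sql.pyspark.inferNestedDictAsStruct.enabled", "false")
spark.range(1).select(sf.lit({"a": 1, "b": 2}).alias("m")).show()
# +----------------+
# |               m|
# +----------------+
# |{a -> 1, b -> 2}|
# +----------------+

# With the conf set to "true", the same dict becomes a StructType column
# with fields "a" and "b" (shown by show() as {1, 2}).
```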
5 changes: 5 additions & 0 deletions python/pyspark/errors/error-conditions.json
@@ -164,6 +164,11 @@
"The value <provider> does not represent a correct collation provider. Supported providers are: [<supportedProviders>]."
]
},
"COLUMN_IN_DICT" : {
"message": [
"`<func_name>` does not allow a Column in a dict."
]
},
"COLUMN_IN_LIST": {
"message": [
"`<func_name>` does not allow a Column in a list."
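For context on when the new `COLUMN_IN_DICT` condition fires, a small sketch assuming the `lit` changes further down in this PR and an active session:

```python
from pyspark.sql import SparkSession, functions as sf

spark = SparkSession.builder.getOrCreate()
df = spark.range(10)

# Dict *values* may not be Columns; this is expected to raise
# PySparkValueError with errorClass "COLUMN_IN_DICT".
sf.lit({"a": df.id})
```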
16 changes: 15 additions & 1 deletion python/pyspark/sql/connect/functions/builtin.py
@@ -70,7 +70,10 @@
ArrayType,
StringType,
)
from pyspark.sql.utils import enum_to_value as _enum_to_value
from pyspark.sql.utils import (
enum_to_value as _enum_to_value,
get_conf as _get_conf,
)

# The implementation of pandas_udf is embedded in pyspark.sql.function.pandas_udf
# for code reuse.
@@ -259,6 +262,7 @@ def col(col: str) -> Column:

def lit(col: Any) -> Column:
from pyspark.sql.connect.column import Column as ConnectColumn
from itertools import chain

if isinstance(col, Column):
return col
@@ -276,6 +280,16 @@ def lit(col: Any) -> Column:
messageParameters={"dtype": col.dtype.name},
)
return array(*[lit(c) for c in col]).cast(ArrayType(dt))
elif isinstance(col, dict):
if any(isinstance(c, Column) for c in col.values()):
raise PySparkValueError(
errorClass="COLUMN_IN_DICT", messageParameters={"func_name": "lit"}
)
dict_as_struct = _get_conf("spark.sql.pyspark.inferNestedDictAsStruct.enabled")
if dict_as_struct and dict_as_struct.lower() == "true":
return struct(*[lit(value).alias(key) for key, value in col.items()])
else:
return create_map(*[lit(x) for x in chain(*col.items())])
return ConnectColumn(LiteralExpression._from_value(col))


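One note on the map branch above: `chain(*col.items())` flattens the dict into the alternating key/value sequence that `create_map` expects. A standalone illustration:

```python
from itertools import chain

d = {"a": 1, "b": 2}
print(list(chain(*d.items())))  # ['a', 1, 'b', 2]
# so create_map(*[lit(x) for x in chain(*d.items())]) is equivalent to
# create_map(lit("a"), lit(1), lit("b"), lit(2))
```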
50 changes: 46 additions & 4 deletions python/pyspark/sql/functions/builtin.py
@@ -38,7 +38,7 @@
Union,
ValuesView,
)

from itertools import chain
from pyspark.errors import PySparkTypeError, PySparkValueError
from pyspark.errors.utils import _with_origin
from pyspark.sql.column import Column
@@ -66,6 +66,7 @@
from pyspark.sql.utils import (
to_str as _to_str,
has_numpy as _has_numpy,
get_conf as _get_conf,
try_remote_functions as _try_remote_functions,
get_active_spark_context as _get_active_spark_context,
enum_to_value as _enum_to_value,
@@ -164,12 +165,13 @@ def lit(col: Any) -> Column:

Parameters
----------
col : :class:`~pyspark.sql.Column`, str, int, float, bool or list, NumPy literals or ndarray.
the value to make it as a PySpark literal. If a column is passed,
col : :class:`~pyspark.sql.Column`, str, int, float, bool or list, NumPy literals, ndarray
or dict. the value to make it as a PySpark literal. If a column is passed,
it returns the column as is.

.. versionchanged:: 3.4.0
.. versionchanged:: 4.0.0
Since 3.4.0, it supports the list type.
Since 4.0.0, it supports the dict type for the creation of a map.

Returns
-------
Expand Down Expand Up @@ -253,6 +255,34 @@ def lit(col: Any) -> Column:
+------------------+-------+-----------------+--------------------+
| [true, false]| []| [1.5, 0.1]| [a, b, c]|
+------------------+-------+-----------------+--------------------+

Example 7: Creating a literal column as a map from a dict,
when ``spark.sql.pyspark.inferNestedDictAsStruct.enabled`` is false

>>> import pyspark.sql.functions as sf
>>> spark.conf.set("spark.sql.pyspark.inferNestedDictAsStruct.enabled", "false")
>>> spark.range(1).select(
... sf.lit({"a": 1, "b": 2}).alias("map_col")
... ).show()
+----------------+
| map_col|
+----------------+
|{a -> 1, b -> 2}|
+----------------+

Example 8: Creating a literal column as a struct from a dict,
when ``spark.sql.pyspark.inferNestedDictAsStruct.enabled`` is true

>>> import pyspark.sql.functions as sf
>>> spark.conf.set("spark.sql.pyspark.inferNestedDictAsStruct.enabled", "true")
>>> spark.range(1).select(
... sf.lit({"a": 1, "b": 2}).alias("struct_col")
... ).show() # doctest: +SKIP
+----------+
|struct_col|
+----------+
|    {1, 2}|
+----------+
"""
if isinstance(col, Column):
return col
@@ -262,6 +292,18 @@
errorClass="COLUMN_IN_LIST", messageParameters={"func_name": "lit"}
)
return array(*[lit(item) for item in col])
elif isinstance(col, dict):
# Skip checking if the keys are column as Columns are not hashable
# and cannot be used as dict keys in the first place.
if any(isinstance(value, Column) for value in col.values()):
raise PySparkValueError(
errorClass="COLUMN_IN_DICT", messageParameters={"func_name": "lit"}
)
dict_as_struct = _get_conf("spark.sql.pyspark.inferNestedDictAsStruct.enabled")
if dict_as_struct and dict_as_struct.lower() == "true":
return struct(*[lit(value).alias(key) for key, value in col.items()])
else:
return create_map(*[lit(x) for x in chain(*col.items())])
elif _has_numpy:
if isinstance(col, np.generic):
dt = _from_numpy_type(col.dtype)
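The inline comment above leans on the fact that a `Column` cannot be used as a dict key in the first place (it is not hashable), so only the values need an explicit check. A quick illustration, assuming an active session:

```python
df = spark.range(1)

{"a": df.id}  # constructs fine, but lit() rejects it with COLUMN_IN_DICT
{df.id: 1}    # never reaches lit(): raises TypeError because Column is unhashable
```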
88 changes: 87 additions & 1 deletion python/pyspark/sql/tests/test_functions.py
@@ -29,10 +29,12 @@
from pyspark.sql import Row, Window, functions as F, types
from pyspark.sql.avro.functions import from_avro, to_avro
from pyspark.sql.column import Column

from pyspark.sql.functions.builtin import nullifzero, randstr, uniform, zeroifnull
from pyspark.sql.types import StructType, StructField, StringType
from pyspark.sql.types import StructType, StructField, StringType, MapType, IntegerType, LongType
from pyspark.testing.sqlutils import ReusedSQLTestCase, SQLTestUtils
from pyspark.testing.utils import have_numpy, assertDataFrameEqual
from pyspark.sql.utils import get_conf as _get_conf


class FunctionsTestsMixin:
@@ -1316,6 +1318,90 @@ def test_lit_list(self):
messageParameters={"func_name": "lit"},
)

# SPARK-48665: added support for dict type
def test_lit_dict_struct(self):
_get_conf.cache_clear()
with self.sql_conf({"spark.sql.pyspark.inferNestedDictAsStruct.enabled": True}):
# Not nested dict
expected_types = StructType(
[
StructField("a", IntegerType(), False),
StructField("b", IntegerType(), False),
]
)
test_dict = {"a": 1, "b": 2}
actual = self.spark.range(1).select(F.lit(test_dict).alias("dict_to_struct"))
column_type = actual.schema["dict_to_struct"].dataType
actual = actual.first()[0]
self.assertEqual(column_type, expected_types)
self.assertEqual(actual, Row(a=1, b=2))

# Nested dict
expected_types = StructType(
[
StructField("a", StructType([StructField("c", IntegerType(), False)]), False),
StructField("b", StructType([StructField("d", IntegerType(), False)]), False),
]
)
test_dict = {"a": {"c": 1}, "b": {"d": 2}}
actual = self.spark.range(1).select(F.lit(test_dict).alias("dict_to_struct"))
column_type = actual.schema["dict_to_struct"].dataType
actual = actual.first()[0]
self.assertEqual(column_type, expected_types)
self.assertEqual(actual, Row(a=Row(c=1), b=Row(d=2)))

# SPARK-48665: added support for dict type
def test_lit_dict_map(self):
_get_conf.cache_clear()
with self.sql_conf({"spark.sql.pyspark.inferNestedDictAsStruct.enabled": False}):
# Not nested dict
test_dict = {"a": 1, "b": 2}
actual = self.spark.range(1).select(F.lit(test_dict).alias("dict_to_map"))
column_type = actual.schema["dict_to_map"].dataType
actual = actual.first()[0]
self.assertEqual(column_type, MapType(StringType(), IntegerType(), False))
self.assertEqual(actual, test_dict)

# Nested dict
test_dict = {"a": {"1": 1}, "b": {"2": 2}}
actual = self.spark.range(1).select(F.lit(test_dict).alias("dict_to_map"))
column_type = actual.schema["dict_to_map"].dataType
actual = actual.first()[0]
self.assertEqual(
column_type,
MapType(StringType(), MapType(StringType(), IntegerType(), False), False),
)
self.assertEqual(actual, test_dict)

# dict with mixed value types: 1 (int), "2" (str), None
test_dict = {"a": 1, "b": "2", "c": None}
expected_dict = {"a": 1, "b": 2, "c": None}
actual = self.spark.range(1).select(F.lit(test_dict).alias("dict_to_map"))
column_type = actual.schema["dict_to_map"].dataType
actual = actual.first()[0]
self.assertEqual(actual, expected_dict)
self.assertEqual(column_type, MapType(StringType(), LongType(), True))

# SPARK-48665: added support for dict type
def test_lit_dict_raise_error(self):
_get_conf.cache_clear()
df = self.spark.range(10)
dicts = [
{"a": df.id},
{"a": {"b": df.id}},
]

for d in dicts:
with self.assertRaises(PySparkValueError) as pe:
F.lit(d)

self.check_error(
exception=pe.exception,
errorClass="COLUMN_IN_DICT",
messageParameters={"func_name": "lit"},
)

# Test added for SPARK-39832; change Python API to accept both col & str as input
def test_regexp_replace(self):
df = self.spark.createDataFrame(
15 changes: 15 additions & 0 deletions python/pyspark/sql/utils.py
@@ -474,3 +474,18 @@ def remote_only(func: Union[Callable, property]) -> Union[Callable, property]:
else:
func._remote_only = True # type: ignore[attr-defined]
return func


@functools.lru_cache()
def get_conf(config_requested: str) -> Optional[str]:
from pyspark.sql import SparkSession

spark = SparkSession.getActiveSession()
if spark:
return spark.conf.get(config_requested)
from pyspark import SparkConf

conf = SparkConf()
if conf:
return conf.get(config_requested)
return None
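Because `get_conf` is wrapped in `functools.lru_cache`, the first lookup of a key is memoized for the rest of the process; that is why the tests above call `_get_conf.cache_clear()` before toggling `spark.sql.pyspark.inferNestedDictAsStruct.enabled`. A sketch of the interaction, assuming an active `spark` session:

```python
from pyspark.sql.utils import get_conf

spark.conf.set("spark.sql.pyspark.inferNestedDictAsStruct.enabled", "false")
get_conf("spark.sql.pyspark.inferNestedDictAsStruct.enabled")  # 'false' (now cached)

spark.conf.set("spark.sql.pyspark.inferNestedDictAsStruct.enabled", "true")
get_conf("spark.sql.pyspark.inferNestedDictAsStruct.enabled")  # still 'false', served from the cache

get_conf.cache_clear()
get_conf("spark.sql.pyspark.inferNestedDictAsStruct.enabled")  # 'true'
```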