diff --git a/README.md b/README.md
index 0c17cd6..ca24129 100644
--- a/README.md
+++ b/README.md
@@ -32,4 +32,18 @@ To verify the automatically detected location, call
 
 ```python
 findspark.find()
-```
\ No newline at end of file
+```
+
+Findspark can add a startup file to the current IPython profile so that the environment variables will be properly set and pyspark will be imported upon IPython startup. This file is created when `edit_profile` is set to `True`, and is placed in the profile in use (e.g. one started with `ipython --profile=myprofile`).
+
+```python
+findspark.init('/path/to/spark_home', edit_profile=True)
+```
+
+Findspark can also append to the .bashrc configuration file, if it is present, so that the environment variables will be properly set whenever a new shell is opened. This is enabled by setting the optional argument `edit_rc` to `True`.
+
+```python
+findspark.init('/path/to/spark_home', edit_rc=True)
+```
+
+If changes are persisted, findspark will not need to be called again unless the Spark installation is moved.
diff --git a/findspark.py b/findspark.py
index d3d3195..be8152a 100644
--- a/findspark.py
+++ b/findspark.py
@@ -7,8 +7,10 @@
 from glob import glob
 import os
 import sys
+import subprocess
+from IPython import get_ipython
 
-__version__ = '0.0.3'
+__version__ = '0.0.4'
 
 
 def find():
@@ -35,7 +37,67 @@ def find():
     return spark_home
 
 
-def init(spark_home=None):
+def change_rc(spark_home, spark_python, py4j):
+    """Persists changes to the environment by changing the shell config.
+
+    Adds lines to .bashrc to set environment variables, including
+    adding dependencies to PYTHONPATH. Will only edit this file if
+    it already exists. Currently only works for bash.
+
+    Parameters
+    ----------
+    spark_home : str
+        Path to Spark installation.
+    spark_python : str
+        Path to python subdirectory of Spark installation.
+    py4j : str
+        Path to py4j library.
+    """
+
+    bashrc_location = os.path.expanduser("~/.bashrc")
+
+    if os.path.isfile(bashrc_location):
+        with open(bashrc_location, 'a') as bashrc:
+            bashrc.write("\n# Added by findspark\n")
+            bashrc.write("export SPARK_HOME=" + spark_home + "\n")
+            bashrc.write("export PYTHONPATH=" + spark_python + ":" +
+                         py4j + ":$PYTHONPATH\n\n")
+
+
+def edit_ipython_profile(spark_home, spark_python, py4j):
+    """Adds a startup file to the current IPython profile to import pyspark.
+
+    The startup file sets the required environment variables and imports pyspark.
+
+    Parameters
+    ----------
+    spark_home : str
+        Path to Spark installation.
+    spark_python : str
+        Path to python subdirectory of Spark installation.
+    py4j : str
+        Path to py4j library.
+    """
+
+    ip = get_ipython()
+
+    if ip:
+        profile_dir = ip.profile_dir.location
+    else:
+        from IPython.utils.path import locate_profile
+        profile_dir = locate_profile()
+
+    startup_file_loc = os.path.join(profile_dir, "startup", "findspark.py")
+
+    with open(startup_file_loc, 'w') as startup_file:
+        # Lines of code to be run when IPython starts
+        startup_file.write("import sys, os\n")
+        startup_file.write("os.environ['SPARK_HOME'] = '" + spark_home + "'\n")
+        startup_file.write("sys.path[:0] = " + str([spark_python, py4j]) + "\n")
+        startup_file.write("import pyspark\n")
+
+
+def init(spark_home=None, edit_rc=False, edit_profile=False):
     """Make pyspark importable.
 
     Sets environmental variables and adds dependencies to sys.path.
@@ -45,7 +107,13 @@
     ----------
     spark_home : str, optional, default = None
         Path to Spark installation, will try to find automatically
-        if not provided
+        if not provided.
+    edit_rc : bool, optional, default = False
+        Whether to attempt to persist changes by appending to shell
+        config.
+    edit_profile : bool, optional, default = False
+        Whether to create an IPython startup file to automatically
+        configure and import pyspark.
     """
 
     if not spark_home:
@@ -58,3 +126,9 @@
     spark_python = os.path.join(spark_home, 'python')
     py4j = glob(os.path.join(spark_python, 'lib', 'py4j-*.zip'))[0]
     sys.path[:0] = [spark_python, py4j]
+
+    if edit_rc:
+        change_rc(spark_home, spark_python, py4j)
+
+    if edit_profile:
+        edit_ipython_profile(spark_home, spark_python, py4j)
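
For context, a minimal end-to-end sketch of the options introduced by this diff. The Spark path is a placeholder, `edit_profile=True` assumes the call is made from inside IPython, and the small job at the end is only there to confirm that `pyspark` is importable after `init`:

```python
import findspark

# Set SPARK_HOME and sys.path for this session, and persist the setup:
# edit_rc appends exports to ~/.bashrc, edit_profile writes an IPython
# startup file (requires running inside IPython). Path is a placeholder.
findspark.init('/path/to/spark_home', edit_rc=True, edit_profile=True)

import pyspark

sc = pyspark.SparkContext(appName='findspark-test')
print(sc.parallelize(range(10)).sum())
sc.stop()
```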