Merge pull request #618 from HazyResearch/slimdown
Fixes to various bugs and packaging issues with the option to slim things down
netj authored Jan 10, 2017
2 parents d50cdeb + efd668f commit 5c28bea
Showing 18 changed files with 177 additions and 89 deletions.
3 changes: 3 additions & 0 deletions .gitignore
@@ -26,3 +26,6 @@
# python-related
*.pyc
__pycache__/

# conda related
/conda_build.sh
10 changes: 6 additions & 4 deletions Makefile
@@ -184,10 +184,12 @@ checkstyle:

# submodules to build and the files to copy out from each of them
include util/build/build-submodules.mk
$(BUILD_SUBMODULE)/inference/dimmwitted.mk : COPY = dw
$(BUILD_SUBMODULE)/util/mindbender.mk : COPY = @prefix@/
$(BUILD_SUBMODULE)/compiler/ddlog.mk : COPY = target/scala-2.11/ddlog-assembly-0.1-SNAPSHOT.jar
$(BUILD_SUBMODULE)/runner/mkmimo.mk : COPY = mkmimo
$(call BUILD_SUBMODULE_AND_COPY, inference/dimmwitted, dw )
$(call BUILD_SUBMODULE_AND_COPY, compiler/ddlog , target/scala-2.11/ddlog-assembly-0.1-SNAPSHOT.jar )
$(call BUILD_SUBMODULE_AND_COPY, runner/mkmimo , mkmimo )
ifndef NO_MINDBENDER
$(call BUILD_SUBMODULE_AND_COPY, util/mindbender , @prefix@/ )
endif

# XXX legacy targets kept to reduce surprise
.PHONY: build-sampler
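The four per-target COPY variable assignments are folded into BUILD_SUBMODULE_AND_COPY macro calls, which makes it easy to guard the Mindbender submodule behind NO_MINDBENDER for a slimmer package. As a hypothetical usage sketch (NO_MINDBENDER is from this diff; the bare make invocation is an assumption):

    # build and stage DeepDive without the Mindbender submodule
    NO_MINDBENDER=1 make
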
51 changes: 42 additions & 9 deletions compiler/compile-codegen
@@ -4,12 +4,32 @@
# Author: Jaeho Shin <[email protected]>
# Created: 2015-12-05
set -euo pipefail
pids=(--)

: ${DEEPDIVE_NUM_PROCESSES:=$(nproc --ignore=1)}

# a handy barrier to not spawn too many codegen processes
num_active=0
wait_if_needed() {
local pid=$!
if [[ $num_active -ge $DEEPDIVE_NUM_PROCESSES ]]; then
wait -n
let num_active--
fi
let ++num_active
}
wait_all() {
while [[ $num_active -gt 0 ]]; do
wait -n
let num_active--
done
}

# translate codegen instructions in JSON into commands and run them in parallel
eval "$(jq -r '
@sh "
(
( @sh "
{
x=\(.path)
echo >&2 \" run/$x\"
echo >&2 \" run/$x (PID $BASHPID)\"
# ensure parent dir exists
xd=\"$(dirname \"$x\")\"
mkdir -p \"$xd\"
@@ -18,19 +18,32 @@ eval "$(jq -r '
chmod +w \"$x\" \"$xd\" || true
mv -f --no-target-directory \"$x\" \"$x\"~
fi
# write contents
echo \(.content) >\"$x\"
"
+ (.content |
def find_no_occurs(eof; repeat):
if contains(eof * repeat) | not then eof * repeat
else find_no_occurs(eof; repeat+1)
end;
"\(find_no_occurs("__EOF"; 1))__" as $EOF | "
# write contents
cat >\"$x\" <<'\''\($EOF)'\''
\(.)
\($EOF)
")
+ (.mode | if . then @sh "
# adjust mode
chmod \(.) \"$x\""
else ""
end)
# finally, make sure each codegen runs asynchronously
+ @sh "
) & pids+=($!) # run each codegen asynchronously
} & wait_if_needed
"
)
')"
waitall() { shift; local pid=; for pid; do wait $pid; done; }
waitall "${pids[@]}" # ensure every process exits with zero status

# wait for all asynchronous processes to finish
wait_all
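
Two changes stand out in this file. First, generated file contents are no longer emitted with echo: the jq program computes a heredoc delimiter via find_no_occurs, repeating "__EOF" until that string no longer occurs in the content and then appending "__", so the quoted heredoc can never be terminated early by its own payload. Second, rather than collecting every PID and waiting once at the end, codegen jobs are now throttled: wait_if_needed blocks on `wait -n` (bash 4.3+), which returns as soon as any one background job exits, keeping at most DEEPDIVE_NUM_PROCESSES jobs active. A minimal standalone sketch of that throttling pattern, assuming a hypothetical do_work in place of the generated commands:

    #!/usr/bin/env bash
    set -euo pipefail
    : ${MAX_JOBS:=4}
    num_active=0
    throttle() {
        # once the limit is reached, block until any one job exits
        if [[ $num_active -ge $MAX_JOBS ]]; then
            wait -n           # also propagates a failed job's status under set -e
            let num_active--
        fi
        let ++num_active
    }
    do_work() { sleep 1; echo "done: $1"; }   # hypothetical workload
    for i in $(seq 20); do
        do_work "$i" & throttle
    done
    # drain the remaining jobs
    while [[ $num_active -gt 0 ]]; do wait -n; let num_active--; done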
5 changes: 5 additions & 0 deletions compiler/deepdive-check
@@ -21,6 +21,7 @@
set -euo pipefail

: ${DEEPDIVE_CHECK_HOME:="$DEEPDIVE_HOME/util/compile-check"}
: ${DEEPDIVE_CHECK_SKIP:=}

## parse options
Mode=run_one_checker
@@ -105,6 +106,10 @@ run_all_checkers() {
[[ -x "$chkr" ]] || continue
local name=${chkr##"$DEEPDIVE_CHECK_HOME"/}
name=${name#compile-check-*-}
if [[ -n $DEEPDIVE_CHECK_SKIP ]] && grep -qFf <(printf '%s\n' $DEEPDIVE_CHECK_SKIP) <<<"$name"; then
echo "skipping $name (as per \$DEEPDIVE_CHECK_SKIP)"
continue
fi
echo "checking if $name"
run_one_checker_by_path "$chkr" >/dev/null & pids+=($!) # run checkers asynchronously
done
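DEEPDIVE_CHECK_SKIP is matched with grep -qFf, so every whitespace-separated word in the variable acts as a fixed substring to look for in a checker's name. A hypothetical invocation (the checker names are illustrative, and it assumes this script is reached via the deepdive check subcommand):

    # skip every checker whose name contains either word
    DEEPDIVE_CHECK_SKIP='corenlp input_schema' deepdive check
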
12 changes: 8 additions & 4 deletions database/db-driver/postgresql/db-query-tsj
@@ -1,4 +1,4 @@
#!/usr/bin/env python2
#!/usr/bin/env python
# db-query-tsj -- Runs SQL against PostgreSQL and formats output rows in TSJ
#
# $ db-query-tsj SQL COLUMN_TYPE...
@@ -18,7 +18,6 @@ column_types = sys.argv[2:]
# write TSJ to stdout
tsj_output = sys.stdout


# SQL types that are already formatted in JSON (to bypass ujson.dump)
TYPES_TO_PASS_THRU = [ "json" ]

@@ -47,8 +46,8 @@ for i,ty in enumerate(column_types):
else: # dump in JSON except a few
stmts.append('ujson.dump(columns[%d], tsj_output)' % i)
stmts.append('tsj_output.write("\\n")')
exec compile("def write_tsj_output(columns):\n" + "\n".join(" " + s for s in stmts),
"%s codegen" % sys.argv[0], "exec")
exec(compile("def write_tsj_output(columns):\n" + "\n".join(" " + s for s in stmts),
"%s codegen" % sys.argv[0], "exec"))

# tell psycopg2 to actually skip parsing any JSON
psycopg2.extras.register_default_json(loads=lambda x: x)
@@ -67,6 +66,11 @@ conn = psycopg2.connect(
)
# execute given SQL query, and output each row in TSJ format
with conn:
# NOTE JSON is always UTF-8, so the following line is crucial to keep the
# code safe from user's locale playing with Python's absolute absurdness of
# keeping 'ascii' as default encoding despite changing everything else in
# their programming language
conn.set_client_encoding("UTF-8")
with conn.cursor("named") as curs:
curs.execute(sql)
for columns in curs:
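Besides forcing the client encoding to UTF-8, this file's shebang and codegen are made portable across Python versions: exec is a statement in Python 2 but a builtin function in Python 3, so exec(compile(...)) works under both. For context, a hypothetical invocation following the usage line at the top of the script (table and column types are illustrative):

    # one tab-separated-JSON row per result row; the json column bypasses
    # ujson and is passed through verbatim
    db-query-tsj 'SELECT id, name, features FROM candidate' int text json
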
3 changes: 1 addition & 2 deletions extern/bundle-all.conf
@@ -8,8 +8,7 @@ moreutils
graphviz
bc
pbzip2
python-virtualenv
postgresql
python-lib
perl-local-lib

postgresql
2 changes: 1 addition & 1 deletion extern/bundle-none.conf
@@ -1,3 +1,3 @@
# NOTE this list is kept empty to bundle none of the runtime dependencies
python-virtualenv
python-lib
perl-local-lib
29 changes: 29 additions & 0 deletions extern/bundled/python-lib/install.sh
@@ -0,0 +1,29 @@
#!/usr/bin/env bash
set -euo pipefail -x

unset PYTHONPATH # existing PYTHONPATH can interfere

# install Python requirements with pip
fetch-verify get-pip.py https://bootstrap.pypa.io/get-pip.py
for python in python2.7 python3.{6,5,4}; do
type $python || continue
( export PYTHONPATH="$PWD"/prefix/lib/$python/site-packages
$python get-pip.py --upgrade --force-reinstall --ignore-installed --prefix prefix setuptools
$python get-pip.py --upgrade --force-reinstall --ignore-installed --prefix prefix -r requirements.txt
)
done

# remove pip and setuptools
shopt -s extglob
rm -rf prefix/bin/@(pip@(|[23]*)|wheel) prefix/lib/python*/site-packages/@(pip|setuptools|easy_install)@(|-*)

# make sure no entrypoints have absolute path to python in its shebang
for cmd in prefix/bin/*; do
head -1 "$cmd" | grep -q '^#!/[^[:space:]]*/python.*$' || continue
sed -e '1s:^#!/[^[:space:]]*/python.*$:#!/usr/bin/env python:' -i~ "$cmd"
rm -f "$cmd"~
done

# make sure things are properly exposed
symlink-under-depends-prefix lib -d prefix/lib/python*
symlink-under-depends-prefix bin -x prefix/bin/!(activate@(|.*|_this.py)|python@(|[23]*|-config)|pip@(|[23]*)|easy_install@(|-[23]*)|wheel)
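
The pruning and symlinking steps rely on bash extended globs (shopt -s extglob): @(a|b) matches exactly one of the alternatives and !(...) matches anything else. A quick illustration of the prefix/bin removal pattern used above (file names are hypothetical):

    shopt -s extglob
    for f in pip pip3.5 wheel activate dw; do
        if [[ $f == @(pip@(|[23]*)|wheel) ]]; then
            echo "pruned: $f"    # pip, pip3.5, wheel
        else
            echo "kept:   $f"    # activate, dw
        fi
    done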
File renamed without changes.
40 changes: 0 additions & 40 deletions extern/bundled/python-virtualenv/install.sh

This file was deleted.

45 changes: 33 additions & 12 deletions runner/compute-driver/local/compute-execute
@@ -15,21 +15,43 @@ eval "$(jq2sh <<<"$DEEPDIVE_COMPUTER_CONFIG" \
num_processes='.num_processes' \
num_parallel_unloads='.num_parallel_unloads' \
num_parallel_loads='.num_parallel_loads' \
named_pipes_dir='.named_pipes_dir' \
#
)"
# respect the DEEPDIVE_NUM_PROCESSES environment
num_processes=${DEEPDIVE_NUM_PROCESSES:-${num_processes:-$(nproc --ignore=1)}}
num_parallel_unloads=${DEEPDIVE_NUM_PARALLEL_UNLOADS:-${num_parallel_unloads:-1}}
num_parallel_loads=${DEEPDIVE_NUM_PARALLEL_LOADS:-${num_parallel_loads:-1}}

# ensure mkfifo works on named_pipes_dir, respecting DEEPDIVE_NAMED_PIPES_DIR
named_pipes_dir=${DEEPDIVE_NAMED_PIPES_DIR:-${named_pipes_dir:-$PWD}}
can_create_named_pipes() {
mkfifo "$pipes_dir"/.deepdive_named_pipe_probe.$$
rm -f "$pipes_dir"/.deepdive_named_pipe_probe.$$
}
{ pipes_dir=$named_pipes_dir && can_create_named_pipes; } ||
{ pipes_dir=$HOME && can_create_named_pipes; } ||
{ pipes_dir=${TMPDIR:=/tmp} && can_create_named_pipes; } ||
error "None of the following paths allow creation of named pipes:" \
" $named_pipes_dir" \
" $HOME" \
" $TMPDIR" \
"Please set DEEPDIVE_NAMED_PIPES_DIR to a writable path in a filesystem that allows mkfifo" \
#

# declare all input arguments
declare -- "$@"

# create a subdirectory for collocating named pipes
pipes=$(mktemp -d "$pipes_dir/deepdive-compute-execute.XXXXXXX")
trap 'rm -rf "$pipes"' EXIT

# show configuration
echo "Executing with the following configuration:"
echo " DEEPDIVE_NUM_PROCESSES=$num_processes"
echo " DEEPDIVE_NUM_PARALLEL_UNLOADS=$num_parallel_unloads"
echo " DEEPDIVE_NUM_PARALLEL_LOADS=$num_parallel_loads"
echo " DEEPDIVE_NAMED_PIPES_DIR=$pipes_dir"

# XXX there are conditional branches below depending on whether input_sql
# and/or output_relation is given, to support four use cases:
@@ -39,29 +61,28 @@ echo " DEEPDIVE_NUM_PARALLEL_LOADS=$num_parallel_loads"
# 4) database-independent command which simply runs in parallel

# set up named pipes for parallel processes and make sure they are cleaned up upon exit
[[ -z $input_sql ]] || for i in $(seq $num_processes); do rm -f process-$i.input ; mkfifo process-$i.input ; done
[[ -z $output_relation ]] || for i in $(seq $num_processes); do rm -f process-$i.output; mkfifo process-$i.output; done
trap 'rm -f process-*.{input,output} feed_processes-* output_computed-*' EXIT
[[ -z $input_sql ]] || for i in $(seq $num_processes); do mkfifo "$pipes"/process-$i.input ; done
[[ -z $output_relation ]] || for i in $(seq $num_processes); do mkfifo "$pipes"/process-$i.output; done
# now spawn processes attached to the named pipes in reverse order (from sink to source)
pids_command=() pids_load=() pids_unload=()

# spawn multiple processes attached to the pipes
if [[ -n $output_relation && -n $input_sql ]]; then # process with input from/output to database
for i in $(seq $num_processes); do
DEEPDIVE_CURRENT_PROCESS_INDEX=$i \
bash -c "$command" <process-$i.input >process-$i.output &
bash -c "$command" <"$pipes"/process-$i.input >"$pipes"/process-$i.output &
pids_command+=($!)
done
elif [[ -n $input_sql ]]; then # input-only process
for i in $(seq $num_processes); do
DEEPDIVE_CURRENT_PROCESS_INDEX=$i \
bash -c "$command" <process-$i.input &
bash -c "$command" <"$pipes"/process-$i.input &
pids_command+=($!)
done
elif [[ -n $output_relation ]]; then # output-only process
for i in $(seq $num_processes); do
DEEPDIVE_CURRENT_PROCESS_INDEX=$i \
bash -c "$command" >process-$i.output &
bash -c "$command" >"$pipes"/process-$i.output &
pids_command+=($!)
done
else # neither output_relation nor input_sql specified
Expand All @@ -74,23 +95,23 @@ fi

if [[ -n $output_relation ]]; then
# set up pipes for parallel loads
rm -f output_computed-*; for i in $(seq $num_parallel_loads); do mkfifo output_computed-$i; done
for i in $(seq $num_parallel_loads); do mkfifo "$pipes"/output_computed-$i; done
# use mkmimo again to merge outputs of multiple processes into a single stream
mkmimo process-*.output \> output_computed-* &
mkmimo "$pipes"/process-*.output \> "$pipes"/output_computed-* &
pids_load+=($!)
# load the output data to the temporary table in the database
deepdive-load "$output_relation" output_computed-* &
deepdive-load "$output_relation" "$pipes"/output_computed-* &
pids_load+=($!)
fi

if [[ -n $input_sql ]]; then
# set up pipes for parallel unloads
rm -f feed_processes-*; for i in $(seq $num_parallel_unloads); do mkfifo feed_processes-$i; done
for i in $(seq $num_parallel_unloads); do mkfifo "$pipes"/feed_processes-$i; done
# unload data from the database and pour into the pipes
deepdive-db unload "$input_sql" "$DEEPDIVE_LOAD_FORMAT" feed_processes-* &
deepdive-db unload "$input_sql" "$DEEPDIVE_LOAD_FORMAT" "$pipes"/feed_processes-* &
pids_unload+=($!)
# use mkmimo to distribute input data to multiple processes
mkmimo feed_processes-* \> process-*.input &
mkmimo "$pipes"/feed_processes-* \> "$pipes"/process-*.input &
pids_unload+=($!)
fi

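Previously the named pipes were created in the current working directory, which fails on filesystems without FIFO support (e.g., some network or FUSE mounts), and cleanup relied on pattern-matched deletes. Now candidate directories are probed in order and all pipes are collocated under one disposable subdirectory removed by a single trap. A standalone sketch of the probe-and-fall-back idea (error() is DeepDive's shell helper, stubbed here):

    #!/usr/bin/env bash
    set -euo pipefail
    error() { printf '%s\n' "$@" >&2; exit 2; }  # stand-in for DeepDive's helper
    can_create_named_pipes() {
        # try to create, then remove, a uniquely named FIFO in $pipes_dir
        mkfifo "$pipes_dir"/.probe.$$ && rm -f "$pipes_dir"/.probe.$$
    }
    pipes_dir=
    for candidate in "${DEEPDIVE_NAMED_PIPES_DIR:-$PWD}" "$HOME" "${TMPDIR:-/tmp}"; do
        pipes_dir=$candidate
        can_create_named_pipes 2>/dev/null && break
        pipes_dir=
    done
    [[ -n $pipes_dir ]] || error "none of the candidate paths allow mkfifo"
    echo "named pipes will live under: $pipes_dir"
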
16 changes: 12 additions & 4 deletions shell/deepdive
@@ -48,11 +48,19 @@ if ! [[ "${DEEPDIVE_SHELL:-}" -ef "$0" ]]; then

DEEPDIVE_BUNDLED_PREFIX="$DEEPDIVE_HOME"/lib/bundled/.all

# make sure virtualenv in Python is available
PATH="$DEEPDIVE_HOME/lib/bundled/python-virtualenv/prefix/bin:$PATH"
# make sure DeepDive bundled executables are on the PATH
PATH="$DEEPDIVE_BUNDLED_PREFIX/bin:$PATH"

# make sure bundled Python libraries and their entrypoint scripts are available
PATH="$DEEPDIVE_HOME/lib/bundled/python-lib/prefix/bin:$PATH"
python_version=$(python -V 2>&1 | sed 's/^Python \([0-9]*\.[0-9]*\)\..*/\1/')
for python_lib_dir in "$DEEPDIVE_HOME"/lib/bundled/python-lib/prefix/lib/python"$python_version"/site-packages
do [[ -d "$python_lib_dir" ]] || continue
PYTHONPATH="$python_lib_dir${PYTHONPATH:+:$PYTHONPATH}"
done

# make sure ddlib in Python is available
PYTHONPATH="$DEEPDIVE_HOME"/lib/python:"${PYTHONPATH:+:$PYTHONPATH}"
PYTHONPATH="$DEEPDIVE_HOME"/lib/python"${PYTHONPATH:+:$PYTHONPATH}"
export PYTHONPATH

# make sure bundled Perl modules are available via local::lib
@@ -61,7 +69,7 @@ if ! [[ "${DEEPDIVE_SHELL:-}" -ef "$0" ]]; then
set -u

# make sure DeepDive utilities are at the beginning of the PATH
PATH="$DEEPDIVE_HOME/util:$DEEPDIVE_HOME/bin:$DEEPDIVE_BUNDLED_PREFIX/bin:$PATH"
PATH="$DEEPDIVE_HOME/util:$DEEPDIVE_HOME/bin:$PATH"
export PATH

# make sure bash is used as the default shell, e.g., by system(3), by make(1)
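The launcher now locates the bundled site-packages directory for whichever python comes first on the PATH (python -V prints to stderr on Python 2, hence the 2>&1), and the ddlib entry drops a stray extra colon that previously crept into PYTHONPATH. The version extraction can be checked in isolation (the output shown is illustrative):

    $ python -V 2>&1 | sed 's/^Python \([0-9]*\.[0-9]*\)\..*/\1/'
    2.7
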
2 changes: 2 additions & 0 deletions stage.sh
@@ -155,8 +155,10 @@ stage util/draw_calibration_plot util/
stage util/calibration.py util/
stage util/calibration.plg util/

if [[ -z ${NO_MINDBENDER:-} ]]; then
stage .build/submodule/util/mindbender/@prefix@/ mindbender/
stage util/mindbender-wrapper.sh bin/mindbender
fi

# runtime dependencies after building them from source
! [[ -e extern/.build/bundled ]] ||