1 | #!/usr/bin/env bash
|
2 | #
|
3 | # Why is CPython configure slower under OSH?
|
4 | #
|
5 | # Usage:
|
6 | # benchmarks/autoconf.sh <function name>
|
7 | #
|
8 | # Examples:
|
9 | # $0 patch-pyconf # times builtin
|
10 | # $0 measure-times # time-tsv with gc stats
|
11 | # $0 report-times # times builtin
|
12 | #
|
13 | # $0 measure-syscalls
|
14 | # $0 report-syscalls
|
15 | # $0 report-processes
|
16 | # $0 report-external
|
17 | #
|
18 | # Simpler:
|
19 | # $0 measure-rusage # time-tsv
|
20 |
|
21 | set -o nounset
|
22 | set -o pipefail
|
23 | set -o errexit
|
24 |
|
25 | REPO_ROOT=$(cd "$(dirname $0)/.."; pwd)
|
26 |
|
27 | source benchmarks/cachegrind.sh # with-cachegrind
|
28 | source benchmarks/callgrind.sh # with-cachegrind
|
29 | source test/tsv-lib.sh # $TAB
|
30 |
|
31 | readonly BASE_DIR_RELATIVE=_tmp/autoconf
|
32 | readonly BASE_DIR=$REPO_ROOT/$BASE_DIR_RELATIVE
|
33 | readonly PY_CONF=$REPO_ROOT/Python-2.7.13/configure
|
34 |
|
35 | #
|
36 | # Trying to measure allocation/GC overhead
|
37 | #
|
38 | # This DOES NOT HELP because bumpleak/bumproot are **slower** on bigger heaps.
|
39 | # There's less cache locality!
|
40 | #
|
41 |
|
42 | cpython-configure-tasks() {
|
43 | local -a variants=( opt+bumpleak opt+bumproot opt )
|
44 | for v in ${variants[@]}; do
|
45 | echo "${v}${TAB}_bin/cxx-$v/osh"
|
46 | done
|
47 | }
|
48 |
|
49 | cpython-setup() {
|
50 | cpython-configure-tasks | while read -r _ osh; do
|
51 | ninja $osh
|
52 | done
|
53 | }
|
54 |
|
55 | measure-alloc-overhead() {
|
56 | local base_dir=$BASE_DIR/alloc-overhead
|
57 | rm -r -f -v $base_dir
|
58 |
|
59 | cpython-configure-tasks | while read -r variant osh; do
|
60 | osh=$REPO_ROOT/$osh
|
61 |
|
62 | local task_dir=$base_dir/$variant
|
63 |
|
64 | mkdir -p $task_dir
|
65 | pushd $task_dir > /dev/null
|
66 |
|
67 | local -a flags=(
|
68 | --output "$base_dir/$variant.tsv"
|
69 | --rusage
|
70 | )
|
71 |
|
72 | local -a time_argv
|
73 |
|
74 | time_argv=(
|
75 | time-tsv --print-header
|
76 | "${flags[@]}"
|
77 | --field variant
|
78 | )
|
79 | "${time_argv[@]}"
|
80 |
|
81 | time_argv=(
|
82 | time-tsv --append
|
83 | "${flags[@]}"
|
84 | --field "$variant"
|
85 | -- $osh $PY_CONF
|
86 | )
|
87 |
|
88 | #echo "${time_argv[@]}"
|
89 | "${time_argv[@]}"
|
90 |
|
91 | popd > /dev/null
|
92 |
|
93 | done
|
94 | }
|
95 |
|
96 | #
|
97 | # Compare bash/dash/osh locally
|
98 | #
|
99 |
|
100 | measure-rusage() {
|
101 | local base_dir=$BASE_DIR/rusage
|
102 | rm -r -f -v $base_dir
|
103 |
|
104 | shell-tasks | while read -r sh_label sh_path; do
|
105 |
|
106 | local task_dir=$base_dir/$sh_label
|
107 |
|
108 | mkdir -p $task_dir
|
109 | pushd $task_dir > /dev/null
|
110 |
|
111 | local -a flags=(
|
112 | --output "$base_dir/$sh_label.tsv"
|
113 | --rusage
|
114 | )
|
115 |
|
116 | local -a time_argv
|
117 |
|
118 | time_argv=(
|
119 | time-tsv --print-header
|
120 | "${flags[@]}"
|
121 | --field sh_label
|
122 | )
|
123 | "${time_argv[@]}"
|
124 |
|
125 | time_argv=(
|
126 | time-tsv --append
|
127 | "${flags[@]}"
|
128 | --field "$sh_label"
|
129 | -- $sh_path $PY_CONF
|
130 | )
|
131 |
|
132 | #echo "${time_argv[@]}"
|
133 | "${time_argv[@]}"
|
134 |
|
135 | popd > /dev/null
|
136 |
|
137 | done
|
138 | }
|
139 |
|
140 | #
|
141 | # Now try strace
|
142 | #
|
143 |
|
144 | shell-tasks() {
|
145 | echo "bash${TAB}bash"
|
146 | echo "dash${TAB}dash"
|
147 | echo "osh${TAB}$REPO_ROOT/_bin/cxx-opt/osh"
|
148 | }
|
149 |
|
150 | measure-syscalls() {
|
151 | local osh=_bin/cxx-opt/osh
|
152 | #local osh=_bin/cxx-dbg/osh
|
153 |
|
154 | ninja $osh
|
155 |
|
156 | local base_dir=$BASE_DIR/syscalls
|
157 |
|
158 | rm -r -f -v $base_dir
|
159 |
|
160 | shell-tasks | while read -r sh_label sh_path; do
|
161 | local dir=$base_dir/$sh_label
|
162 | mkdir -p $dir
|
163 |
|
164 | local counts_dir=$base_dir/$sh_label
|
165 | mkdir -p $counts_dir
|
166 |
|
167 | pushd $dir
|
168 | #strace -o $counts -c $sh_path $PY_CONF
|
169 | # See how many external processes are started?
|
170 | #strace -o $counts -ff -e execve $sh_path $PY_CONF
|
171 | strace -o $counts_dir/syscalls -ff $sh_path $PY_CONF
|
172 | popd
|
173 | done
|
174 | }
|
175 |
|
176 | # --- _tmp/autoconf/bash
|
177 | # 6047
|
178 | # 4621
|
179 | # --- _tmp/autoconf/dash
|
180 | # 6088
|
181 | # 4627
|
182 | # --- _tmp/autoconf/osh
|
183 | # 5691
|
184 | # 4631
|
185 | #
|
186 | # Woah we start fewer processes! But are not faster?
|
187 |
|
188 | grep-exec() {
|
189 | egrep --no-filename -o 'execve\("[^"]+' "$@"
|
190 | }
|
191 |
|
192 | # andy@hoover:~/git/oilshell/oil$ benchmarks/autoconf.sh report-syscalls
|
193 | # --- _tmp/autoconf/syscalls/bash
|
194 | # 2592 _tmp/autoconf/syscalls/bash/syscalls.903220
|
195 | # 2608 _tmp/autoconf/syscalls/bash/syscalls.898727
|
196 | # 2632 _tmp/autoconf/syscalls/bash/syscalls.898387
|
197 | # 2679 _tmp/autoconf/syscalls/bash/syscalls.898292
|
198 | # 2853 _tmp/autoconf/syscalls/bash/syscalls.898927
|
199 | # 2873 _tmp/autoconf/syscalls/bash/syscalls.898334
|
200 | # 2920 _tmp/autoconf/syscalls/bash/syscalls.898895
|
201 | # 3204 _tmp/autoconf/syscalls/bash/syscalls.898664
|
202 | # 112549 _tmp/autoconf/syscalls/bash/syscalls.897471
|
203 | # 1360223 total
|
204 | #
|
205 | # --- _tmp/autoconf/syscalls/dash
|
206 | # 2592 _tmp/autoconf/syscalls/dash/syscalls.909344
|
207 | # 2607 _tmp/autoconf/syscalls/dash/syscalls.904921
|
208 | # 2630 _tmp/autoconf/syscalls/dash/syscalls.904581
|
209 | # 2683 _tmp/autoconf/syscalls/dash/syscalls.904486
|
210 | # 2851 _tmp/autoconf/syscalls/dash/syscalls.905109
|
211 | # 2873 _tmp/autoconf/syscalls/dash/syscalls.904528
|
212 | # 2920 _tmp/autoconf/syscalls/dash/syscalls.905088
|
213 | # 3204 _tmp/autoconf/syscalls/dash/syscalls.904858
|
214 | # 112922 _tmp/autoconf/syscalls/dash/syscalls.903626
|
215 | # 1372118 total
|
216 | #
|
217 | # --- _tmp/autoconf/syscalls/osh
|
218 | # 2592 _tmp/autoconf/syscalls/osh/syscalls.915226
|
219 | # 2607 _tmp/autoconf/syscalls/osh/syscalls.910993
|
220 | # 2630 _tmp/autoconf/syscalls/osh/syscalls.910647
|
221 | # 2679 _tmp/autoconf/syscalls/osh/syscalls.910561
|
222 | # 2851 _tmp/autoconf/syscalls/osh/syscalls.911162
|
223 | # 2873 _tmp/autoconf/syscalls/osh/syscalls.910599
|
224 | # 2920 _tmp/autoconf/syscalls/osh/syscalls.911143
|
225 | # 3204 _tmp/autoconf/syscalls/osh/syscalls.910936
|
226 | # 72921 _tmp/autoconf/syscalls/osh/syscalls.909769
|
227 | # 1211074 total
|
228 |
|
229 | report-processes() {
|
230 | for sh_dir in $BASE_DIR_RELATIVE/syscalls/*; do
|
231 | echo "--- $sh_dir"
|
232 | ls $sh_dir/* | wc -l
|
233 | grep-exec $sh_dir/syscalls.* | wc -l
|
234 | echo
|
235 |
|
236 | done
|
237 | }
|
238 |
|
239 | report-external() {
|
240 | local n=${1:-5}
|
241 |
|
242 | for sh_dir in $BASE_DIR_RELATIVE/syscalls/*; do
|
243 | echo "--- $sh_dir"
|
244 |
|
245 | grep-exec $sh_dir/syscalls.* | sort | uniq -c | sort -n | tail -n $n
|
246 | echo
|
247 | done
|
248 | }
|
249 |
|
250 | report-syscalls() {
|
251 | # Hm this is instructive, the shell itself makes the most syscalls
|
252 | # And fewer than other shells?
|
253 |
|
254 | for sh_dir in $BASE_DIR_RELATIVE/syscalls/*; do
|
255 | echo "--- $sh_dir"
|
256 | wc -l $sh_dir/syscalls.* | sort -n | tail
|
257 | echo
|
258 | done
|
259 | }
|
260 |
|
261 | #
|
262 | # Cachegrind
|
263 | #
|
264 |
|
265 | measure-valgrind() {
|
266 | local tool=$1
|
267 |
|
268 | # opt seems to give OK results, but I thought dbg was more accurate
|
269 | #local osh=_bin/cxx-opt/osh
|
270 | local osh=_bin/cxx-dbg/osh
|
271 |
|
272 | ninja $osh
|
273 |
|
274 | local osh=$REPO_ROOT/$osh
|
275 |
|
276 | local base_dir=$REPO_ROOT/_tmp/$tool
|
277 |
|
278 | local dir=$base_dir/cpython-configure
|
279 | rm -r -f -v $dir
|
280 |
|
281 | local out_file=$base_dir/cpython-configure.txt
|
282 |
|
283 | mkdir -v -p $dir
|
284 |
|
285 | pushd $dir
|
286 | $tool $out_file $osh $PY_CONF
|
287 | popd
|
288 | }
|
289 |
|
290 | measure-cachegrind() {
|
291 | measure-valgrind with-cachegrind
|
292 | }
|
293 |
|
294 | measure-callgrind() {
|
295 | # This takes ~5 minutes with opt binary, ~6:43 with dbg
|
296 | # vs ~15 seconds uninstrumented
|
297 | time measure-valgrind with-callgrind
|
298 | }
|
299 |
|
300 | # Note:
|
301 | # benchmarks/osh-runtime.sh compares our release, which does not have #ifdef
|
302 | # GC_TIMING, so we don't know total GC time.
|
303 |
|
304 | # TODO:
|
305 | #
|
306 | # - Run locally, reproduce GC_TIMING - this is not in the release build
|
307 | # - it seems to say only 143 ms total GC time, but we're seeing 1.5+ seconds
|
308 | # slowdown on Cpython configure vs. bash
|
309 | # - I want a local run that automates it, and returns PERCENTAGES for elapsed
|
310 | # time, sys time, user time
|
311 | # - We also might not want to amortize free() inside Allocate()
|
312 | # - #ifdef LAZY_FREE I think! That might show a big slowdown with free
|
313 |
|
314 | patch-pyconf() {
|
315 | #sed -i $'s/ac_compile=\'$CC/ac_compile=\'times; $CC/g' $PY_CONF
|
316 |
|
317 | # temporary
|
318 | echo 'times > $SH_BENCHMARK_TIMES' >> $PY_CONF
|
319 | }
|
320 |
|
321 | measure-times() {
|
322 | local osh=_bin/cxx-opt/osh
|
323 | ninja $osh
|
324 |
|
325 | local base_dir=$BASE_DIR/times
|
326 | rm -r -f -v $base_dir
|
327 |
|
328 | local trace_dir=$base_dir/oils-trace
|
329 | mkdir -p $trace_dir
|
330 |
|
331 | shell-tasks | while read -r sh_label sh_path; do
|
332 | #case $sh_label in bash|dash) continue ;; esac
|
333 |
|
334 | local dir=$base_dir/$sh_label
|
335 | mkdir -p $dir
|
336 |
|
337 | pushd $dir
|
338 |
|
339 | local -a flags=(
|
340 | --output "$base_dir/$sh_label.tsv"
|
341 | --rusage
|
342 | )
|
343 |
|
344 | local -a time_argv
|
345 |
|
346 | time_argv=(
|
347 | time-tsv --print-header
|
348 | "${flags[@]}"
|
349 | --field sh_label
|
350 | )
|
351 | "${time_argv[@]}"
|
352 |
|
353 | time_argv=(
|
354 | time-tsv --append
|
355 | "${flags[@]}"
|
356 | --field "$sh_label"
|
357 | -- $sh_path $PY_CONF
|
358 | )
|
359 |
|
360 | #echo "${time_argv[@]}"
|
361 |
|
362 | # 1269 argv0.json files created
|
363 | # we can miss some via NOLASTFORK optimization
|
364 | #OILS_TRACE_DIR=$trace_dir \
|
365 |
|
366 | _OILS_GC_VERBOSE=1 OILS_GC_STATS_FD=99 \
|
367 | SH_BENCHMARK_TIMES=$base_dir/$sh_label.times.txt \
|
368 | "${time_argv[@]}" \
|
369 | 99>$base_dir/$sh_label.gc-stats.txt
|
370 |
|
371 | popd
|
372 | done
|
373 | }
|
374 |
|
375 | inner-long-tsv() {
|
376 | python2 -c '
|
377 | import os, re, sys
|
378 |
|
379 | def PrintRow(row):
|
380 | print("\t".join(row))
|
381 |
|
382 | PrintRow(["shell", "who", "what", "seconds"])
|
383 |
|
384 | for path in sys.argv[1:]:
|
385 | filename = os.path.basename(path)
|
386 | shell = filename.split(".")[0]
|
387 |
|
388 | f = open(path)
|
389 | s = f.read()
|
390 |
|
391 | secs = re.findall("0m([0-9.]+)s", s)
|
392 | assert len(secs) == 4, secs
|
393 |
|
394 | PrintRow([shell, "self", "user", secs[0]])
|
395 | PrintRow([shell, "self", "sys", secs[1]])
|
396 | PrintRow([shell, "child", "user", secs[2]])
|
397 | PrintRow([shell, "child", "sys", secs[3]])
|
398 |
|
399 | # Non-normalized, but OK
|
400 | total_secs = sum(float(s) for s in secs)
|
401 | PrintRow([shell, "both", "both", str(total_secs)])
|
402 |
|
403 | ' $BASE_DIR/times/*.times.txt
|
404 | }
|
405 |
|
406 | compare-dim() {
|
407 | # 8% more child system time
|
408 | local who=${1:-child}
|
409 | local what=${2:-user}
|
410 |
|
411 | echo "=== $who $what ==="
|
412 |
|
413 | # Annoying
|
414 | # https://www.math.utah.edu/docs/info/gawk_8.html
|
415 | # "If, for some reason, you need to force a number to be converted to a
|
416 | # string, concatenate the empty string, "", with that number. To force a
|
417 | # string to be converted to a number, add zero to that string."
|
418 |
|
419 | cat $BASE_DIR/times-long.tsv | awk -v "who=$who" -v "what=$what" '
|
420 | BEGIN {
|
421 | TAB = "\t"
|
422 |
|
423 | i = 0
|
424 |
|
425 | printf "%s\t%s\t%s\t%s\n", "shell", "secs", "ratio", "diff secs"
|
426 | }
|
427 | $2 == who && $3 == what {
|
428 | if (i == 0) {
|
429 | first_secs = $4 + 0
|
430 | }
|
431 | i++
|
432 |
|
433 | secs = $4 + 0
|
434 | ratio = secs / first_secs
|
435 | diff = secs - first_secs
|
436 |
|
437 | # Need commas for OFMT to work correctly?
|
438 | printf "%s\t%5.3f\t%5.3f\t%5.3f\n", $1, secs, ratio, diff
|
439 | }
|
440 | '
|
441 |
|
442 | echo
|
443 | }
|
444 |
|
445 | compare-times() {
|
446 | log "INNER"
|
447 | log ''
|
448 |
|
449 | compare-dim self user
|
450 |
|
451 | compare-dim self sys
|
452 |
|
453 | compare-dim child user
|
454 |
|
455 | compare-dim child sys
|
456 |
|
457 | compare-dim both both
|
458 |
|
459 | # outer
|
460 | log "OUTER"
|
461 | log ''
|
462 |
|
463 | compare-dim both elapsed
|
464 |
|
465 | # These kinda match
|
466 | return
|
467 | compare-dim both user
|
468 | compare-dim both sys
|
469 | }
|
470 |
|
471 | outer-long-tsv() {
|
472 | log "=== outer times ==="
|
473 | awk '
|
474 | BEGIN {
|
475 | i = 0
|
476 |
|
477 | printf "%s\t%s\t%s\t%s\n", "shell", "who", "what", "seconds"
|
478 | }
|
479 | i == 0 {
|
480 | #print "Skipping header"
|
481 | i++
|
482 | next
|
483 | }
|
484 | i >= 1 {
|
485 | elapsed = $2 + 0
|
486 | user = $3 + 0
|
487 | sys = $4 + 0
|
488 | sh_label = $6
|
489 |
|
490 | printf "%s\t%s\t%s\t%5.3f\n", sh_label, "both", "elapsed", elapsed
|
491 | printf "%s\t%s\t%s\t%5.3f\n", sh_label, "both", "user", user
|
492 | printf "%s\t%s\t%s\t%5.3f\n", sh_label, "both", "sys", sys
|
493 |
|
494 | i++
|
495 | }
|
496 | ' $BASE_DIR/outer-wide.tsv
|
497 | }
|
498 |
|
499 | report-times() {
|
500 | head $BASE_DIR/times/*.tsv
|
501 | echo
|
502 | head $BASE_DIR/times/*.times.txt
|
503 | echo
|
504 |
|
505 | inner-long-tsv | tee $BASE_DIR/inner-long.tsv
|
506 | echo
|
507 |
|
508 | tsv-concat $BASE_DIR/times/*.tsv | tee $BASE_DIR/outer-wide.tsv
|
509 | outer-long-tsv | tee $BASE_DIR/outer-long.tsv
|
510 | echo
|
511 |
|
512 | tsv-concat $BASE_DIR/{inner,outer}-long.tsv | tee $BASE_DIR/times-long.tsv
|
513 |
|
514 | compare-times
|
515 | }
|
516 |
|
517 | ### Why is clone() taking longer according to strace?
|
518 |
|
519 | fork-tasks() {
|
520 | echo "bash${TAB}bash"
|
521 | echo "dash${TAB}dash"
|
522 |
|
523 | # Hm this is noisy, but cxx-opt-sh does seem slower
|
524 | echo "osh${TAB}$REPO_ROOT/_bin/cxx-opt/osh"
|
525 | echo "osh${TAB}$REPO_ROOT/_bin/cxx-opt-sh/osh"
|
526 | }
|
527 |
|
528 | measure-fork() {
|
529 | fork-tasks | while read -r sh_label sh_path; do
|
530 | #case $sh_label in bash|dash) continue ;; esac
|
531 |
|
532 | echo "=== $sh_path ==="
|
533 |
|
534 | # Builtin is very fast
|
535 | #time $sh_path -c 'for i in $(seq 100); do true; done'
|
536 |
|
537 | # Hm this is very noisy
|
538 | # TODO use hyperfine?
|
539 | time $sh_path -c 'for i in $(seq 100); do /bin/true; done'
|
540 |
|
541 | case $sh_label in
|
542 | osh)
|
543 | # Oops, we are not symlinking to the .stripped binary!
|
544 | # This is explicitly done for symbols and benchmarking.
|
545 | # Hm does that make it slower then?
|
546 |
|
547 | ls -l -L $sh_path
|
548 | ldd $sh_path
|
549 | ;;
|
550 | esac
|
551 | done
|
552 | }
|
553 |
|
554 | # $ head _tmp/elapsed/*.times.txt
|
555 | # ==> _tmp/elapsed/bash.times.txt <==
|
556 | # 0m0.213s 0m0.477s
|
557 | # 0m8.233s 0m2.931s
|
558 | #
|
559 | # ==> _tmp/elapsed/dash.times.txt <==
|
560 | # 0m0.217s 0m0.463s
|
561 | # 0m8.281s 0m2.922s
|
562 | #
|
563 | # ==> _tmp/elapsed/osh.times.txt <==
|
564 | # 0m0.360s 0m0.720s
|
565 | # 0m8.790s 0m2.960s
|
566 |
|
567 | # shell user time - GC and allocs
|
568 | # shell system time - ???
|
569 | # child user time - ???
|
570 | # TODO: count how many processes this is.
|
571 | # It's more than 500 ms
|
572 | # Is that 500 processes, and 1 ms per process?
|
573 |
|
574 | fork-time() {
|
575 | local osh=_bin/cxx-opt/osh
|
576 |
|
577 | $osh -c 'time for i in {1..1000}; do true; done'
|
578 | echo
|
579 |
|
580 | $osh -c 'time for i in {1..1000}; do ( true ); done'
|
581 | echo
|
582 |
|
583 | # Does this increase fork time or no?
|
584 | # Hm I can only get the forks up to 306ms for 1000, or 300 us
|
585 | # But the HereDocWriter does dup() and so forth
|
586 | $osh -c '
|
587 | echo ysh-parse
|
588 | time for i in {1..40}; do
|
589 | . test/ysh-parse-errors.sh
|
590 | done
|
591 | times
|
592 | time for i in {1..1000}; do
|
593 | ( true )
|
594 | done'
|
595 | echo
|
596 | }
|
597 |
|
598 | "$@"
|