OILS / benchmarks / id.sh View on Github | oilshell.org

441 lines, 223 significant
1#!/usr/bin/env bash
2#
3# Keep track of benchmark data provenance.
4#
5# Usage:
6# benchmarks/id.sh <function name>
7
# Strict mode: fail on unset variables, on a failure in any pipeline stage,
# and on any unhandled non-zero exit status.
set -o nounset
set -o pipefail
set -o errexit

# Absolute path of the parent of the directory containing this script.
# The expansions are quoted so that a path containing spaces works, and '&&'
# makes a failed 'cd' abort instead of silently recording the current dir.
REPO_ROOT=$(cd "$(dirname "$0")/.." && pwd)
readonly REPO_ROOT
14
15source build/common.sh # for $CLANG
16source benchmarks/common.sh
17source test/tsv-lib.sh # tsv-row
18
# Print a timestamp suitable for use as a job ID, in the form
# YYYY-MM-DD__HH-MM-SS, as formatted by 'date'.
print-job-id() {
  local -r format='+%Y-%m-%d__%H-%M-%S'
  date "$format"
}
22
23# TODO: add benchmark labels/hashes for osh and all other shells
24#
25# Need to archive labels too.
26#
27# TODO: How do I make sure the zsh label is current? Across different
28# machines?
29#
30# What happens when zsh is silently upgraded?
31# I guess before every benchmark, you have to run the ID collection. Man
32# that is a lot of code.
33#
34# Should I make symlinks to the published location?
35#
36# Maybe bash/dash/mksh/zsh should be invoked through a symlink?
37# Every symlink is a shell runtime version, and it has an associated
38# toolchain?
39
# Platform is ambient?
# _tmp/
#   shell-id/
#     bash/
#       HASH.txt
#       version.txt
#     dash/
#       HASH.txt
#       version.txt
#   host-id/
#     lisa/
#       HASH.txt
#       cpuinfo.txt
53
# ../benchmark-data/
#   shell-id/
#     bash-$HASH/
#     osh-$HASH/   # osh-cpython, osh-ovm?  osh-opy-ovm?  Too many dimensions.
#                  # the other shells don't have this?
#     zsh-$HASH/
#   host-id/
#     lisa-$HASH/
62
# Copy the contents of a file to another file, but only if the source is a
# regular file.
#
# Args:
#   $1  path  file to read, e.g. /etc/lsb-release
#   $2  out   file to write; it is overwritten
#
# A missing source is not an error: nothing is written and the status is 0.
_dump-if-exists() {
  local path=$1
  local out=$2

  if ! test -f "$path"; then
    return 0
  fi

  # Quoted so that a destination containing spaces isn't an ambiguous redirect.
  cat "$path" > "$out"
}
71
72#
73# Shell ID
74#
75
# Record what identifies a shell binary (or python2) as text files.
#
# Args:
#   $1  sh_path  program to describe; must be resolvable by 'command -v'
#   $2  out_dir  directory to write to; created if necessary
#
# Files written to $out_dir:
#   sh-path.txt          always
#   git-commit-hash.txt  when the path matches */osh*
#   git-branch.txt       same, but only when $XSHAR_GIT_COMMIT is not set
#   version.txt          bash, zsh, yash (--version) and python2 (-V)
#   osh-version.txt      osh, except for paths matching *_bin/*/osh
#   dpkg-version.txt     dash, mksh
#
# Calls 'die' when the program can't be found, or when its basename is not
# one of the names handled below.
dump-shell-id() {
  local sh_path=$1
  local out_dir=$2

  if ! command -v "$sh_path" >/dev/null; then
    die "dump-shell-id: Couldn't find $sh_path"
  fi

  mkdir -p "$out_dir"

  printf '%s\n' "$sh_path" > "$out_dir/sh-path.txt"

  # Add extra repository info for osh.
  case $sh_path in
    */osh*)
      local commit_hash=$out_dir/git-commit-hash.txt

      if test -n "${XSHAR_GIT_COMMIT:-}"; then
        # The commit was passed in through the environment; don't ask git.
        echo "$XSHAR_GIT_COMMIT" > "$commit_hash"
      else
        local branch
        branch=$(git rev-parse --abbrev-ref HEAD)
        echo "$branch" > "$out_dir/git-branch.txt"
        git rev-parse "$branch" > "$commit_hash"
      fi
      ;;
  esac

  local sh_name
  sh_name=$(basename "$sh_path")

  case $sh_name in
    bash|zsh|yash)
      "$sh_path" --version > "$out_dir/version.txt"
      ;;
    osh)
      case $sh_path in
        *_bin/*/osh)
          # Doesn't support --version yet
          ;;
        *)
          "$sh_path" --version > "$out_dir/osh-version.txt"
          ;;
      esac
      ;;
    # oils-for-unix|oils-for-unix.stripped)
    #  ;;
    dash|mksh)
      # These don't have version strings!  Ask the package manager instead.
      dpkg -s "$sh_name" > "$out_dir/dpkg-version.txt"
      ;;

    # not a shell, but useful for benchmarks/compute
    python2)
      # Only stderr is captured: that is where the -V output is expected.
      "$sh_path" -V 2> "$out_dir/version.txt"
      ;;
    *)
      die "Invalid shell '$sh_name'"
      ;;
  esac
}
137
# Print the text that a shell's ID hash is computed from.
#
# Args:
#   $1  src  directory of the kind written by dump-shell-id
#
# Each input file is optional; whatever exists contributes to stdout in a
# fixed order.  Always returns 0.
_shell-id-hash() {
  local src=$1

  local file

  # for shells and Python
  file=$src/version.txt
  if test -f "$file"; then
    cat "$file"
  fi

  # Only hash the dimensions we want to keep
  file=$src/dpkg-version.txt
  if test -f "$file"; then
    # '|| true': a file without a matching line contributes nothing, rather
    # than aborting a caller that runs under errexit.
    grep -E '^Version' "$file" || true
  fi

  # Interpreter as CPython vs. OVM is what we care about, so
  # select 'Interpreter:' but not 'Interpreter version:'.
  # For example, the version is different on Ubuntu Bionic vs. Trusty, but we
  # ignore that.
  file=$src/osh-version.txt
  if test -f "$file"; then
    grep -E '^Oil version|^Interpreter:' "$file" || true
  fi

  # For OSH
  file=$src/git-commit-hash.txt
  if test -f "$file"; then
    cat "$file"
  fi

  return 0
}
164
# Copy a dumped shell ID directory to a location named after its hash.
#
# Args:
#   $1  src        directory written by dump-shell-id, e.g. _tmp/prov-tmp/osh
#   $2  dest_base  where to publish; default ../benchmark-data/shell-id
#
# The destination is $dest_base/<shell name>-<first 8 hex digits of the MD5
# of the _shell-id-hash output>.  Writes that 8-character ID to stdout.
publish-shell-id() {
  ### Copy temp directory to hashed location

  local src=$1  # e.g. _tmp/prov-tmp/osh
  local dest_base=${2:-../benchmark-data/shell-id}  # or _tmp/shell-id

  local sh_path sh_name
  read -r sh_path < "$src/sh-path.txt"
  sh_name=$(basename "$sh_path")

  local md5_line digest
  md5_line=$(_shell-id-hash "$src" | md5sum)  # not secure, an identifier
  digest=${md5_line%% *}  # drop the trailing file name field ('-')

  local id="${digest:0:8}"
  local dest="$dest_base/$sh_name-$id"

  mkdir -p "$dest"
  cp --no-target-directory --recursive "$src/" "$dest/"

  # HASH.txt has always held '<md5> -', i.e. the md5sum line with its
  # whitespace collapsed; write that form explicitly.
  echo "$digest -" > "$dest/HASH.txt"

  log "Published shell ID to $dest"

  echo "$id"
}
190
191#
192# Platform ID
193#
194
195# Events that will change the env for a given machine:
196# - kernel upgrade
197# - distro upgrade
198
199# How about ~/git/oilshell/benchmark-data/host-id/lisa-$HASH
200# How to calculate the hash though?
201
# Record what identifies this machine as text files.
#
# Args:
#   $1  out_dir  directory to write to, created if necessary;
#                default _tmp/host-id/<hostname>
#
# Files written to $out_dir:
#   hostname.txt     output of 'hostname'
#   machine.txt      output of 'uname -m'
#   kernel.txt       kernel release, then kernel version
#   lsb-release.txt  copy of /etc/lsb-release, only if that file exists
#   cpuinfo.txt      /proc/cpuinfo without the 'cpu MHz' lines
#   meminfo.txt      the MemTotal line of /proc/meminfo
dump-host-id() {
  local out_dir=${1:-_tmp/host-id/$(hostname)}

  mkdir -p "$out_dir"

  hostname > "$out_dir/hostname.txt"

  # does it make sense to do individual fields like -m?
  # avoid parsing?
  # We care about the kernel and the CPU architecture.
  # There is a lot of redundant information there.
  uname -m > "$out_dir/machine.txt"
  # machine
  { uname --kernel-release
    uname --kernel-version
  } > "$out_dir/kernel.txt"

  _dump-if-exists /etc/lsb-release "$out_dir/lsb-release.txt"

  # remove the cpu MHz field, which changes a lot
  grep -i -v 'cpu mhz' /proc/cpuinfo > "$out_dir/cpuinfo.txt"
  # mem info doesn't make a difference?  I guess it's just nice to check that
  # it's not swapping.  But shouldn't be part of the hash.

  grep '^MemTotal' /proc/meminfo > "$out_dir/meminfo.txt"

  #head $out_dir/* 1>&2  # don't write to stdout
}
230
# There is already the concept of a target triplet?
# http://wiki.osdev.org/Target_Triplet
# It's not exactly the same as what we need here, but it's close.
234
# Print the text that a host's ID hash is computed from.
#
# Args:
#   $1  src  directory of the kind written by dump-host-id
#
# Output: machine.txt, then kernel.txt, then lsb-release.txt if it exists.
# Always returns 0.
_host-id-hash() {
  local src=$1

  # Don't hash CPU or memory
  #cat $src/cpuinfo.txt
  #cat $src/hostname.txt  # e.g. lisa

  cat "$src/machine.txt"  # e.g. x86_64
  cat "$src/kernel.txt"

  # OS
  local file=$src/lsb-release.txt
  if test -f "$file"; then
    cat "$file"
  fi

  return 0
}
253
# Copy a dumped host ID directory to a location named after its hash.
# Writes a short ID to stdout.
#
# Args:
#   $1  src        directory written by dump-host-id, e.g. _tmp/host-id/lisa
#   $2  dest_base  where to publish; default ../benchmark-data/host-id
#
# The destination is $dest_base/<basename of src>-<first 8 hex digits of the
# MD5 of the _host-id-hash output>.
publish-host-id() {
  local src=$1  # e.g. _tmp/host-id/lisa
  local dest_base=${2:-../benchmark-data/host-id}

  local name
  name=$(basename "$src")

  local md5_line digest
  md5_line=$(_host-id-hash "$src" | md5sum)  # not secure, an identifier
  digest=${md5_line%% *}  # drop the trailing file name field ('-')

  local id="${digest:0:8}"
  local dest="$dest_base/$name-$id"

  mkdir -p "$dest"
  cp --no-target-directory --recursive "$src/" "$dest/"

  # HASH.txt has always held '<md5> -', i.e. the md5sum line with its
  # whitespace collapsed; write that form explicitly.
  echo "$digest -" > "$dest/HASH.txt"

  log "Published host ID to $dest"

  echo "$id"
}
277
278#
279# Compilers
280#
281
# Record a compiler's --version output in $out_dir/version.txt.
#
# Args:
#   $1  cc       path to the compiler
#   $2  out_dir  directory to write to, created if necessary;
#                default _tmp/compiler-id/<basename of cc>
#
# Only paths ending in /gcc or /clang are recognized.  For any other path
# version.txt is still created, but it is empty.
dump-compiler-id() {
  local cc=$1  # path to the compiler
  local out_dir=${2:-_tmp/compiler-id/$(basename "$cc")}

  mkdir -p "$out_dir"

  case $cc in
    */gcc|*/clang)
      # Plain --version for both: gcc's -v has more details, but they might
      # be overkill, and clang's -v has stuff we don't want.
      "$cc" --version
      ;;
  esac > "$out_dir/version.txt"
}
299
# Print the text that a compiler's ID hash is computed from: version.txt
# without the lines that contain 'InstalledDir'.
#
# Args:
#   $1  src  directory of the kind written by dump-compiler-id
#
# The status is grep's, so it is non-zero when no line is left to print.
_compiler-id-hash() {
  local src=$1

  # Remove some extraneous information from clang.
  grep -v InstalledDir "$src/version.txt"
}
306
# Copy a dumped compiler ID directory to a location named after its hash.
# Writes a short ID to stdout.
#
# Args:
#   $1  src        directory written by dump-compiler-id,
#                  e.g. _tmp/compiler-id/clang
#   $2  dest_base  where to publish; default ../benchmark-data/compiler-id
#
# The destination is $dest_base/<basename of src>-<first 8 hex digits of the
# MD5 of the _compiler-id-hash output>.
publish-compiler-id() {
  local src=$1  # e.g. _tmp/compiler-id/clang
  local dest_base=${2:-../benchmark-data/compiler-id}

  # Declared separately from the assignment so a failing command
  # substitution isn't masked by the exit status of 'local'.
  local name
  name=$(basename "$src")

  local md5_line digest
  md5_line=$(_compiler-id-hash "$src" | md5sum)  # not secure, an identifier
  digest=${md5_line%% *}  # drop the trailing file name field ('-')

  local id="${digest:0:8}"
  local dest="$dest_base/$name-$id"

  mkdir -p "$dest"
  cp --no-target-directory --recursive "$src/" "$dest/"

  # HASH.txt has always held '<md5> -', i.e. the md5sum line with its
  # whitespace collapsed; write that form explicitly.
  echo "$digest -" > "$dest/HASH.txt"

  log "Published compiler ID to $dest"

  echo "$id"
}
328
329#
330# Table Output
331#
332
333# Writes a table of host and shells to stdout. Writes text files and
334# calculates IDs for them as a side effect.
335#
336# The table can be passed to other benchmarks to ensure that their provenance
337# is recorded.
338
# Dump and publish the host ID and one shell ID per given shell, and record
# them as a table.
#
# Args:
#   $1     maybe_host  host label; if empty, the output of 'hostname' is used
#   $2     job_id      copied into every row
#   $3     out_dir     IDs are published under $out_dir/host-id and
#                      $out_dir/shell-id
#   $4...  paths of the shells to describe
#
# Writes, relative to the current directory:
#   _tmp/provenance.txt  one space-separated row per shell, no header
#   _tmp/provenance.tsv  the same rows via tsv-row, preceded by a header
#   _tmp/prov-tmp/       scratch directories for the dumps
shell-provenance-2() {
  ### Write to _tmp/provenance.{txt,tsv} and $out_dir/{shell,host-id}

  local maybe_host=$1  # if non-empty, it overrides the host name
  local job_id=$2
  local out_dir=$3
  shift 3

  # log "*** shell-provenance"

  mkdir -p _tmp/provenance

  local host_name
  if test -n "$maybe_host"; then  # label is often 'no-host'
    host_name=$maybe_host
  else
    host_name=$(hostname)
  fi

  log "*** $maybe_host $host_name $job_id $out_dir"

  local tmp_dir=_tmp/prov-tmp/$host_name
  dump-host-id "$tmp_dir"

  local host_hash
  host_hash=$(publish-host-id "$tmp_dir" "$out_dir/host-id")
  local shell_hash

  local out_txt=_tmp/provenance.txt  # Legacy text file
  : > "$out_txt"  # truncate, no header

  local out_tsv=_tmp/provenance.tsv
  tsv-row job_id host_name host_hash sh_path shell_hash > "$out_tsv"

  local i=0

  # Declared local so the loop variable doesn't leak into the caller's scope.
  local sh_path
  for sh_path in "$@"; do
    # There can be two different OSH, so the scratch dir is numbered rather
    # than named after the shell.

    tmp_dir=_tmp/prov-tmp/shell-$i
    i=$((i + 1))

    dump-shell-id "$sh_path" "$tmp_dir"

    # writes to ../benchmark-data or _tmp/provenance
    shell_hash=$(publish-shell-id "$tmp_dir" "$out_dir/shell-id")

    # note: filter-provenance depends on $4 being $sh_path
    # APPEND to txt
    echo "$job_id $host_name $host_hash $sh_path $shell_hash" >> "$out_txt"

    tsv-row "$job_id" "$host_name" "$host_hash" "$sh_path" "$shell_hash" >> "$out_tsv"
  done

  log "Wrote $out_txt and $out_tsv"
}
395
# Dump and publish the host ID and the IDs of gcc and $CLANG, and record them
# in a text file with one space-separated row per compiler.
#
# Globals:
#   CLANG  path to clang (read)
#
# The file is _tmp/provenance/<host>.<job id>.compiler-provenance.txt,
# relative to the current directory.  Its name is written to stdout, so the
# function can be used in a command substitution.
compiler-provenance() {
  local job_id
  job_id=$(print-job-id)

  local host
  host=$(hostname)

  # Filename
  local out=_tmp/provenance/${host}.${job_id}.compiler-provenance.txt
  # Don't rely on another function having created the directory.
  mkdir -p _tmp/provenance

  local tmp_dir=_tmp/host-id/$host
  dump-host-id "$tmp_dir"

  local host_hash
  host_hash=$(publish-host-id "$tmp_dir")

  local compiler_hash
  local compiler_path name

  # gcc is assumed to be in the $PATH.  Each candidate is a single quoted
  # word, so a path containing spaces stays intact.
  for compiler_path in "$(command -v gcc || true)" "$CLANG"; do
    if test -z "$compiler_path"; then
      continue  # gcc wasn't found, or $CLANG is empty
    fi

    name=$(basename "$compiler_path")

    tmp_dir=_tmp/compiler-id/$name
    dump-compiler-id "$compiler_path" "$tmp_dir"

    compiler_hash=$(publish-compiler-id "$tmp_dir")

    echo "$job_id $host $host_hash $compiler_path $compiler_hash"
  done > "$out"

  log "Wrote $out"

  # Return value used in command sub
  echo "$out"
}
431
# Assign the string 'returned' to the variable whose name is passed in $1,
# i.e. an "out parameter" implemented with a bash nameref.
#
# Args:
#   $1  name of the variable to assign to
out-param() {
  # The nameref has a name callers are unlikely to use.  If it were called
  # 'out', then 'out-param out' would be a circular name reference.
  declare -n __out_param_ref=$1

  __out_param_ref=returned
}
437
# When this file is executed as id.sh, rather than sourced by a script with
# another name, run the function named by the first argument and pass it the
# remaining arguments.  Quoted so a $0 containing spaces doesn't break 'test'.
if test "$(basename "$0")" = 'id.sh'; then
  "$@"
fi
441