OILS / benchmarks / id.sh View on Github | oilshell.org

448 lines, 226 significant
#!/usr/bin/env bash
#
# Keep track of benchmark data provenance.
#
# Usage:
#   benchmarks/id.sh <function name>
7
# Strict mode: fail on unset variables, on a failure anywhere in a pipeline,
# and on any unhandled non-zero exit status.
set -o nounset
set -o pipefail
set -o errexit

# Absolute path of the directory one level above the directory holding $0.
# '&&' rather than ';' so that a failed cd makes the assignment fail (and
# errexit fire) instead of silently recording the current directory.
REPO_ROOT=$(cd "$(dirname "$0")/.." && pwd)
readonly REPO_ROOT
14
# Shared libraries, given as paths relative to the current directory.
# build/common.sh is needed for $CLANG.
source build/common.sh
source benchmarks/common.sh
# soil/common.sh is needed for find-dir-html.
source soil/common.sh
# test/tsv-lib.sh is needed for tsv-row.
source test/tsv-lib.sh
19
print-job-id() {
  ### Print the current local time as YYYY-MM-DD__HH-MM-SS

  local stamp_format='+%Y-%m-%d__%H-%M-%S'
  date "$stamp_format"
}
23
# TODO: add benchmark labels/hashes for osh and all other shells
#
# Need to archive labels too.
#
# TODO: How do I make sure the zsh label is current?  Across different
# machines?
#
# What happens when zsh is silently upgraded?
# I guess before every benchmark, you have to run the ID collection.  Man,
# that is a lot of code.
#
# Should I make symlinks to the published location?
#
# Maybe bash/dash/mksh/zsh should be invoked through a symlink?
# Every symlink is a shell runtime version, and it has an associated
# toolchain?
40
# Platform is ambient?
# _tmp/
#   shell-id/
#     bash/
#       HASH.txt
#       version.txt
#     dash/
#       HASH.txt
#       version.txt
#   host-id/
#     lisa/
#       HASH.txt
#       cpuinfo.txt

# ../benchmark-data/
#   shell-id/
#     bash-$HASH/
#     osh-$HASH/   # osh-cpython, osh-ovm?  osh-opy-ovm?  Too many dimensions.
#                  # the other shells don't have this?
#     zsh-$HASH/
#   host-id/
#     lisa-$HASH/
63
_dump-if-exists() {
  ### Copy the contents of a regular file to $out; do nothing if it's absent
  #
  # Args:
  #   $1  path  file to read
  #   $2  out   file to write (overwritten)
  #
  # Returns 0 when $path is not a regular file, otherwise the status of cat.

  local path=$1
  local out=$2

  if ! test -f "$path"; then
    return 0
  fi

  # Quoted so that an output path containing spaces is not an
  # "ambiguous redirect".
  cat "$path" > "$out"
}
72
#
# Shell ID
#
76
dump-shell-id() {
  ### Write files identifying one shell (or python2) into a directory
  #
  # Args:
  #   $1  sh_path  shell to describe; must be resolvable by 'command -v'
  #   $2  out_dir  directory to write into (created if missing)
  #
  # Always writes sh-path.txt.  Depending on the shell, also writes
  # version.txt, osh-version.txt, dpkg-version.txt, git-branch.txt and
  # git-commit-hash.txt.  Calls 'die' for a missing or unrecognized shell.

  local sh_path=$1
  local out_dir=$2

  if ! command -v "$sh_path" >/dev/null; then
    die "dump-shell-id: Couldn't find $sh_path"
  fi

  mkdir -p "$out_dir"

  printf '%s\n' "$sh_path" > "$out_dir/sh-path.txt"

  # Add extra repository info for osh.
  case $sh_path in
    */osh*)
      local commit_hash=$out_dir/git-commit-hash.txt

      if test -n "${XSHAR_GIT_COMMIT:-}"; then
        # An explicitly provided commit takes precedence over asking git.
        printf '%s\n' "$XSHAR_GIT_COMMIT" > "$commit_hash"
      else
        local branch
        branch=$(git rev-parse --abbrev-ref HEAD)
        printf '%s\n' "$branch" > "$out_dir/git-branch.txt"
        git rev-parse "$branch" > "$commit_hash"
      fi
      ;;
  esac

  local sh_name
  sh_name=$(basename "$sh_path")

  case $sh_name in
    bash|zsh|yash)
      "$sh_path" --version > "$out_dir/version.txt"
      ;;
    osh)
      case $sh_path in
        *_bin/*/osh)
          # Doesn't support --version yet
          ;;
        *)
          "$sh_path" --version > "$out_dir/osh-version.txt"
          ;;
      esac
      ;;
    # oils-for-unix|oils-for-unix.stripped)
    #  ;;
    dash|mksh)
      # These don't have version strings!
      dpkg -s "$sh_name" > "$out_dir/dpkg-version.txt"
      ;;

    # not a shell, but useful for benchmarks/compute
    python2)
      # The version text is captured from stderr.
      "$sh_path" -V 2> "$out_dir/version.txt"
      ;;
    *)
      die "Invalid shell '$sh_name'"
      ;;
  esac
}
138
_shell-id-hash() {
  ### Print the text that identifies a shell, selected from a dumped ID dir
  #
  # Args:
  #   $1  src  directory containing the dumped shell ID files
  #
  # Files that don't exist are skipped.  The output is meant to be piped to
  # a checksum tool.

  local src=$1

  local file

  # for shells and Python
  file=$src/version.txt
  if test -f "$file"; then
    cat "$file"
  fi

  # Only hash the dimensions we want to keep
  file=$src/dpkg-version.txt
  if test -f "$file"; then
    grep -E '^Version' "$file"
  fi

  # Interpreter as CPython vs. OVM is what we care about, so
  # select 'Interpreter:' but not 'Interpreter version:'.
  # For example, the version is different on Ubuntu Bionic vs. Trusty, but we
  # ignore that.
  file=$src/osh-version.txt
  if test -f "$file"; then
    grep -E '^Oil version|^Interpreter:' "$file"
  fi

  # For OSH
  file=$src/git-commit-hash.txt
  if test -f "$file"; then
    cat "$file"
  fi

  return 0
}
165
publish-shell-id() {
  ### Copy temp directory to hashed location
  #
  # Args:
  #   $1  src        dumped shell ID dir, e.g. _tmp/prov-tmp/osh
  #   $2  dest_base  parent of the published dir
  #                  (default ../benchmark-data/shell-id)
  #
  # The published dir is $dest_base/<shell name>-<first 8 chars of md5>.
  # Writes that 8-character ID to stdout.

  local src=$1  # e.g. _tmp/prov-tmp/osh
  local dest_base=${2:-../benchmark-data/shell-id}  # or _tmp/shell-id

  local sh_path sh_name
  # -r keeps any backslashes in the recorded path literal.
  IFS= read -r sh_path < "$src/sh-path.txt"
  sh_name=$(basename "$sh_path")

  local hash
  hash=$(_shell-id-hash "$src" | md5sum)  # not secure, an identifier

  local id="${hash:0:8}"
  local dest="$dest_base/$sh_name-$id"

  mkdir -p "$dest"
  cp --no-target-directory --recursive "$src/" "$dest/"

  # Intentionally unquoted: word splitting collapses md5sum's '<digest>  -'
  # to '<digest> -', so the content written to HASH.txt stays unchanged.
  # shellcheck disable=SC2086
  echo $hash > "$dest/HASH.txt"

  # for .wwz file
  find-dir-html "$dest"

  log "Published shell ID to $dest"

  echo "$id"
}
194
#
# Platform ID
#

# Events that will change the env for a given machine:
# - kernel upgrade
# - distro upgrade

# How about ~/git/oilshell/benchmark-data/host-id/lisa-$HASH
# How to calculate the hash though?
205
dump-host-id() {
  ### Write files describing the current machine into a directory
  #
  # Args:
  #   $1  out_dir  directory to write into, created if missing
  #                (default _tmp/host-id/<hostname>)
  #
  # Writes hostname.txt, machine.txt, kernel.txt, cpuinfo.txt and
  # meminfo.txt, plus lsb-release.txt when /etc/lsb-release exists.

  local out_dir=${1:-_tmp/host-id/$(hostname)}

  mkdir -p "$out_dir"

  hostname > "$out_dir/hostname.txt"

  # does it make sense to do individual fields like -m?
  # avoid parsing?
  # We care about the kernel and the CPU architecture.
  # There is a lot of redundant information there.
  uname -m > "$out_dir/machine.txt"
  # kernel release, then kernel version
  {
    uname --kernel-release
    uname --kernel-version
  } > "$out_dir/kernel.txt"

  _dump-if-exists /etc/lsb-release "$out_dir/lsb-release.txt"

  # remove the cpu MHz field, which changes a lot
  grep -i -v 'cpu mhz' /proc/cpuinfo > "$out_dir/cpuinfo.txt"
  # mem info doesn't make a difference?  I guess it's just nice to check that
  # it's not swapping.  But shouldn't be part of the hash.

  grep '^MemTotal' /proc/meminfo > "$out_dir/meminfo.txt"

  #head $out_dir/* 1>&2  # don't write to stdout
}
234
# There is already a concept of the triple?
# http://wiki.osdev.org/Target_Triplet
# It's not exactly the same as what we need here, but close.
238
_host-id-hash() {
  ### Print the text that identifies a host, selected from a dumped ID dir
  #
  # Args:
  #   $1  src  directory containing the dumped host ID files
  #
  # Prints machine.txt and kernel.txt, then lsb-release.txt if it exists.

  local src=$1

  # Don't hash CPU or memory
  #cat $src/cpuinfo.txt
  #cat $src/hostname.txt  # e.g. lisa

  cat "$src/machine.txt"  # e.g. x86_64
  cat "$src/kernel.txt"

  # OS
  local file=$src/lsb-release.txt
  if test -f "$file"; then
    cat "$file"
  fi

  return 0
}
257
# Writes a short ID to stdout.
publish-host-id() {
  ### Copy a dumped host ID dir to a location named by its hash
  #
  # Args:
  #   $1  src        dumped host ID dir, e.g. _tmp/host-id/lisa
  #   $2  dest_base  parent of the published dir
  #                  (default ../benchmark-data/host-id)
  #
  # The published dir is $dest_base/<basename of src>-<first 8 chars of md5>.
  # Writes that 8-character ID to stdout.

  local src=$1  # e.g. _tmp/host-id/lisa
  local dest_base=${2:-../benchmark-data/host-id}

  local name
  name=$(basename "$src")

  local hash
  hash=$(_host-id-hash "$src" | md5sum)  # not secure, an identifier

  local id="${hash:0:8}"
  local dest="$dest_base/$name-$id"

  mkdir -p "$dest"
  cp --no-target-directory --recursive "$src/" "$dest/"

  # Intentionally unquoted: word splitting collapses md5sum's '<digest>  -'
  # to '<digest> -', so the content written to HASH.txt stays unchanged.
  # shellcheck disable=SC2086
  echo $hash > "$dest/HASH.txt"

  # for .wwz file
  find-dir-html "$dest"

  log "Published host ID to $dest"

  echo "$id"
}
284
#
# Compilers
#
288
dump-compiler-id() {
  ### Write a compiler's --version output to $out_dir/version.txt
  #
  # Args:
  #   $1  cc       path to the compiler
  #   $2  out_dir  directory to write into, created if missing
  #                (default _tmp/compiler-id/<basename of cc>)
  #
  # Only paths ending in /gcc or /clang are queried; for anything else
  # version.txt is created empty.

  local cc=$1  # path to the compiler
  local out_dir=${2:-_tmp/compiler-id/$(basename "$cc")}

  mkdir -p "$out_dir"

  case $cc in
    */gcc|*/clang)
      # --version only: -v has more details, but for gcc they might be
      # overkill, and for clang it has stuff we don't want.
      "$cc" --version
      ;;
  esac > "$out_dir/version.txt"
}
306
_compiler-id-hash() {
  ### Print the text that identifies a compiler, from a dumped ID dir
  #
  # Args:
  #   $1  src  directory containing version.txt

  local src=$1

  # Remove some extraneous information from clang.
  grep -v InstalledDir "$src/version.txt"
}
313
# Writes a short ID to stdout.
publish-compiler-id() {
  ### Copy a dumped compiler ID dir to a location named by its hash
  #
  # Args:
  #   $1  src        dumped compiler ID dir, e.g. _tmp/compiler-id/clang
  #   $2  dest_base  parent of the published dir
  #                  (default ../benchmark-data/compiler-id)
  #
  # The published dir is $dest_base/<basename of src>-<first 8 chars of md5>.
  # Writes that 8-character ID to stdout.

  local src=$1  # e.g. _tmp/compiler-id/clang
  local dest_base=${2:-../benchmark-data/compiler-id}

  # Declared separately so a basename failure isn't masked by 'local'.
  local name
  name=$(basename "$src")

  local hash
  hash=$(_compiler-id-hash "$src" | md5sum)  # not secure, an identifier

  local id="${hash:0:8}"
  local dest="$dest_base/$name-$id"

  mkdir -p "$dest"
  cp --no-target-directory --recursive "$src/" "$dest/"

  # Intentionally unquoted: word splitting collapses md5sum's '<digest>  -'
  # to '<digest> -', so the content written to HASH.txt stays unchanged.
  # shellcheck disable=SC2086
  echo $hash > "$dest/HASH.txt"

  log "Published compiler ID to $dest"

  echo "$id"
}
335
#
# Table Output
#

# Writes a table of host and shells to stdout.  Writes text files and
# calculates IDs for them as a side effect.
#
# The table can be passed to other benchmarks to ensure that their provenance
# is recorded.
345
shell-provenance-2() {
  ### Write to _tmp/provenance.{txt,tsv} and $out_dir/{shell,host-id}
  #
  # Args:
  #   $1     maybe_host  if non-empty, used as the host name instead of
  #                      the output of 'hostname'
  #   $2     job_id      job ID recorded on every row
  #   $3     out_dir     base dir for the published host-id/ and shell-id/
  #   $4...  shell paths, one output row each
  #
  # _tmp/provenance.txt gets space-separated rows with no header;
  # _tmp/provenance.tsv gets a header row followed by the same rows.

  local maybe_host=$1  # if it exists, it overrides the host
  local job_id=$2
  local out_dir=$3
  shift 3

  # log "*** shell-provenance"

  mkdir -p _tmp/provenance

  local host_name
  if test -n "$maybe_host"; then  # label is often 'no-host'
    host_name=$maybe_host
  else
    host_name=$(hostname)
  fi

  log "*** $maybe_host $host_name $job_id $out_dir"

  local tmp_dir=_tmp/prov-tmp/$host_name
  dump-host-id "$tmp_dir"

  local host_hash
  host_hash=$(publish-host-id "$tmp_dir" "$out_dir/host-id")
  local shell_hash

  local out_txt=_tmp/provenance.txt  # Legacy text file
  : > "$out_txt"  # truncate, no header

  local out_tsv=_tmp/provenance.tsv
  tsv-row job_id host_name host_hash sh_path shell_hash > "$out_tsv"

  local i=0

  # Local so that the loop variable doesn't leak into the caller's scope.
  local sh_path
  for sh_path in "$@"; do
    # There can be two different OSH, so the temp dir is numbered rather
    # than named after the shell.
    tmp_dir=_tmp/prov-tmp/shell-$i
    i=$((i + 1))

    dump-shell-id "$sh_path" "$tmp_dir"

    # writes to ../benchmark-data or _tmp/provenance
    shell_hash=$(publish-shell-id "$tmp_dir" "$out_dir/shell-id")

    # note: filter-provenance depends on $4 being $sh_path
    # APPEND to txt
    echo "$job_id $host_name $host_hash $sh_path $shell_hash" >> "$out_txt"

    tsv-row "$job_id" "$host_name" "$host_hash" "$sh_path" "$shell_hash" >> "$out_tsv"
  done

  log "Wrote $out_txt and $out_tsv"
}
402
compiler-provenance() {
  ### Record host and compiler IDs for gcc and $CLANG in a provenance file
  #
  # Takes no arguments.  Writes one row per compiler,
  #   <job id> <host> <host hash> <compiler path> <compiler hash>
  # to _tmp/provenance/<host>.<job id>.compiler-provenance.txt, and prints
  # that file name to stdout.

  local job_id
  job_id=$(print-job-id)

  local host
  host=$(hostname)

  # Filename
  local out=_tmp/provenance/${host}.${job_id}.compiler-provenance.txt
  # The redirect below fails if the directory doesn't exist yet.
  mkdir -p "$(dirname "$out")"

  local tmp_dir=_tmp/host-id/$host
  dump-host-id "$tmp_dir"

  local host_hash
  host_hash=$(publish-host-id "$tmp_dir")

  # gcc is assumed to be in the $PATH.  If it isn't, it's skipped, as is an
  # empty $CLANG.
  local -a compilers=()
  local gcc_path
  if gcc_path=$(command -v gcc); then
    compilers+=("$gcc_path")
  fi
  if test -n "$CLANG"; then
    compilers+=("$CLANG")
  fi

  local compiler_path name compiler_hash

  # The ${arr[@]+...} form keeps an empty array from tripping 'nounset' on
  # older bash versions.
  for compiler_path in ${compilers[@]+"${compilers[@]}"}; do
    name=$(basename "$compiler_path")

    tmp_dir=_tmp/compiler-id/$name
    dump-compiler-id "$compiler_path" "$tmp_dir"

    compiler_hash=$(publish-compiler-id "$tmp_dir")

    echo "$job_id $host $host_hash $compiler_path $compiler_hash"
  done > "$out"

  log "Wrote $out"

  # Return value used in command sub
  echo "$out"
}
438
out-param() {
  ### Assign the string 'returned' to the caller's variable named by $1
  #
  # Args:
  #   $1  name of the variable to assign to

  # The nameref has a name callers are unlikely to use for their own
  # variable: a nameref can't refer to a variable with its own name, so the
  # previous name 'out' made 'out-param out' fail.
  local -n __out_param_ref=$1

  __out_param_ref=returned
}
444
# When invoked as id.sh, run the arguments as a command, i.e.
# 'benchmarks/id.sh <function name> [args...]'.  Under any other $0
# nothing is run.
# ${0##*/} is the basename of $0 without word splitting or a subprocess.
if [[ "${0##*/}" == 'id.sh' ]]; then
  "$@"
fi
448