OILS / benchmarks / id.sh View on Github | oilshell.org

442 lines, 221 significant
1#!/usr/bin/env bash
2#
3# Keep track of benchmark data provenance.
4#
5# Usage:
6# benchmarks/id.sh <function name>
7
8set -o nounset
9set -o pipefail
10set -o errexit
11
# Absolute path of the parent of the directory that contains this script.
# '&&' (rather than ';') keeps a failed cd from silently yielding the current
# directory, and the quotes keep paths containing whitespace intact.
REPO_ROOT=$(cd "$(dirname "$0")/.." && pwd)
readonly REPO_ROOT
14
15source build/common.sh # for $CLANG
16source benchmarks/common.sh
17source soil/common.sh # find-dir-html
18source test/tsv-lib.sh # tsv-row
19
# Print the current local time as a job ID of the form
# YYYY-MM-DD__HH-MM-SS on stdout.
print-job-id() {
  local stamp_format='+%Y-%m-%d__%H-%M-%S'
  date "$stamp_format"
}
23
24# TODO: add benchmark labels/hashes for osh and all other shells
25#
26# Need to archive labels too.
27#
28# TODO: How do I make sure the zsh label is current? Across different
29# machines?
30#
31# What happens when zsh is silently upgraded?
32# I guess before every benchmark, you have to run the ID collection. Man
33# that is a lot of code.
34#
35# Should I make symlinks to the published location?
36#
37# Maybe bash/dash/mksh/zsh should be invoked through a symlink?
38# Every symlink is a shell runtime version, and it has an associated
39# toolchain?
40
41# Platform is ambient?
42# _tmp/
43# shell-id/
44# bash/
45# HASH.txt
46# version.txt
47# dash/
48# HASH.txt
49# version.txt
50# host-id/
51# lisa/
52# HASH.txt
53# cpuinfo.txt
54
55# ../benchmark-data/
56# shell-id/
57# bash-$HASH/
58# osh-$HASH/ # osh-cpython, osh-ovm? osh-opy-ovm? Too many dimensions.
59# # the other shells don't have this?
60# zsh-$HASH/
61# host-id/
62# lisa-$HASH/
63
# Copy a regular file to another path, doing nothing if it does not exist.
#
# Arguments:
#   $1 - path of the file to copy
#   $2 - destination file (overwritten)
# Returns: 0 when $1 is not a regular file; otherwise the status of the copy.
_dump-if-exists() {
  local path=$1
  local out=$2

  if ! test -f "$path"; then
    return 0
  fi

  # The destination is quoted so that paths with whitespace work.
  cat "$path" > "$out"
}
72
73#
74# Shell ID
75#
76
# Record identifying information about an interpreter in a directory.
#
# Arguments:
#   $1 - name or path of the interpreter; must be resolvable by 'command -v'
#   $2 - output directory (created if missing)
# Files written under $2:
#   sh-path.txt                      always; contains $1
#   git-branch.txt, git-commit-hash.txt
#                                    only when $1 matches */osh*
#   version.txt / osh-version.txt / dpkg-version.txt
#                                    depending on the interpreter's basename
# Calls 'die' (defined outside this function) when the interpreter cannot be
# found or its basename is not one of the known names.
dump-shell-id() {
  local sh_path=$1
  local out_dir=$2

  if ! command -v "$sh_path" >/dev/null; then
    die "dump-shell-id: Couldn't find $sh_path"
  fi

  mkdir -p "$out_dir"

  printf '%s\n' "$sh_path" > "$out_dir/sh-path.txt"

  # Add extra repository info for osh.
  case $sh_path in
    */osh*)
      local branch
      branch=$(git rev-parse --abbrev-ref HEAD)
      printf '%s\n' "$branch" > "$out_dir/git-branch.txt"
      git rev-parse "$branch" > "$out_dir/git-commit-hash.txt"
      ;;
  esac

  local sh_name
  sh_name=$(basename "$sh_path")

  case $sh_name in
    bash|zsh|yash)
      "$sh_path" --version > "$out_dir/version.txt"
      ;;
    osh)
      case $sh_path in
        *_bin/*/osh)
          # Doesn't support --version yet
          ;;
        *)
          "$sh_path" --version > "$out_dir/osh-version.txt"
          ;;
      esac
      ;;
    # oils-for-unix|oils-for-unix.stripped)
    #   ;;
    dash|mksh)
      # These don't have version strings!
      dpkg -s "$sh_name" > "$out_dir/dpkg-version.txt"
      ;;

    # not a shell, but useful for benchmarks/compute
    python2)
      # Only stderr is captured for 'python2 -V'.
      "$sh_path" -V 2> "$out_dir/version.txt"
      ;;
    *)
      die "Invalid shell '$sh_name'"
      ;;
  esac
}
132
# Print the parts of a dumped shell ID directory that make up its identity.
# The caller hashes this output.
#
# Arguments:
#   $1 - directory containing the dumped files
# Outputs: the selected lines of whichever files exist, in a fixed order.
# Returns: 0 (missing files are skipped).
_shell-id-hash() {
  local src=$1

  local file

  # for shells and Python
  file=$src/version.txt
  test -f "$file" && cat "$file"

  # Only hash the dimensions we want to keep
  file=$src/dpkg-version.txt
  test -f "$file" && grep -E '^Version' "$file"

  # Interpreter as CPython vs. OVM is what we care about, so
  # select 'Interpreter:' but not 'Interpreter version:'.
  # For example, the version is different on Ubuntu Bionic vs. Trusty, but we
  # ignore that.
  # 'grep -E' replaces 'egrep', which newer GNU grep flags as obsolescent.
  file=$src/osh-version.txt
  test -f "$file" && grep -E '^Oil version|^Interpreter:' "$file"

  # For OSH
  file=$src/git-commit-hash.txt
  test -f "$file" && cat "$file"

  return 0
}
159
# Arguments:
#   $1 - directory with a dumped shell ID; must contain sh-path.txt
#   $2 - base directory for published IDs
#        (default: ../benchmark-data/shell-id)
# Outputs: the 8-character short ID on stdout.
# Side effects: creates $2/<shell name>-<short id>/ holding a copy of $1 plus
# HASH.txt, then calls find-dir-html and log (both defined outside this
# function).
publish-shell-id() {
  ### Copy temp directory to hashed location

  local src=$1  # e.g. _tmp/prov-tmp/osh
  local dest_base=${2:-../benchmark-data/shell-id}  # or _tmp/shell-id

  local sh_path sh_name
  # -r: don't let 'read' interpret backslashes in the stored path
  read -r sh_path < "$src/sh-path.txt"
  sh_name=$(basename "$sh_path")

  local hash
  hash=$(_shell-id-hash "$src" | md5sum)  # not secure, an identifier

  local id="${hash:0:8}"
  local dest="$dest_base/$sh_name-$id"

  mkdir -p "$dest"
  cp --no-target-directory --recursive "$src/" "$dest/"

  # Deliberately unquoted: word splitting collapses the whitespace in the
  # md5sum output, which keeps the HASH.txt format unchanged.
  # shellcheck disable=SC2086
  echo $hash > "$dest/HASH.txt"

  # for .wwz file
  find-dir-html "$dest"

  log "Published shell ID to $dest"

  echo "$id"
}
188
189#
190# Platform ID
191#
192
193# Events that will change the env for a given machine:
194# - kernel upgrade
195# - distro upgrade
196
197# How about ~/git/oilshell/benchmark-data/host-id/lisa-$HASH
198# How to calculate the hash though?
199
# Record identifying information about the current machine in a directory.
#
# Arguments:
#   $1 - output directory (default: _tmp/host-id/<hostname>); created if
#        missing
# Files written: hostname.txt, machine.txt, kernel.txt, cpuinfo.txt,
# meminfo.txt, and lsb-release.txt when /etc/lsb-release exists.
dump-host-id() {
  local out_dir=${1:-_tmp/host-id/$(hostname)}

  mkdir -p "$out_dir"

  hostname > "$out_dir/hostname.txt"

  # does it make sense to do individual fields like -m?
  # avoid parsing?
  # We care about the kernel and the CPU architecture.
  # There is a lot of redundant information there.
  uname -m > "$out_dir/machine.txt"
  # machine
  {
    uname --kernel-release
    uname --kernel-version
  } > "$out_dir/kernel.txt"

  _dump-if-exists /etc/lsb-release "$out_dir/lsb-release.txt"

  # remove the cpu MHz field, which changes a lot
  grep -i -v 'cpu mhz' /proc/cpuinfo > "$out_dir/cpuinfo.txt"
  # mem info doesn't make a difference?  I guess it's just nice to check that
  # it's not swapping.  But shouldn't be part of the hash.

  grep '^MemTotal' /proc/meminfo > "$out_dir/meminfo.txt"

  #head $out_dir/* 1>&2  # don't write to stdout
}
228
# Isn't there already the concept of a target triplet?
# http://wiki.osdev.org/Target_Triplet
# It's not exactly the same as what we need here, but close.
232
# Print the parts of a dumped host ID directory that make up its identity.
# The caller hashes this output.
#
# Arguments:
#   $1 - directory containing machine.txt and kernel.txt; lsb-release.txt is
#        included when present
# Returns: 0
_host-id-hash() {
  local src=$1

  # Don't hash CPU or memory
  #cat $src/cpuinfo.txt
  #cat $src/hostname.txt  # e.g. lisa

  cat "$src/machine.txt"  # e.g. x86_64
  cat "$src/kernel.txt"

  # OS
  local file=$src/lsb-release.txt
  if test -f "$file"; then
    cat "$file"
  fi

  return 0
}
251
# Copy a dumped host ID directory to a location named after its hash.
# Writes a short ID to stdout.
#
# Arguments:
#   $1 - directory with a dumped host ID; its basename becomes the host label
#   $2 - base directory for published IDs
#        (default: ../benchmark-data/host-id)
# Side effects: creates $2/<label>-<short id>/ holding a copy of $1 plus
# HASH.txt, then calls find-dir-html and log (both defined outside this
# function).
publish-host-id() {
  local src=$1  # e.g. _tmp/host-id/lisa
  local dest_base=${2:-../benchmark-data/host-id}

  local name
  name=$(basename "$src")

  local hash
  hash=$(_host-id-hash "$src" | md5sum)  # not secure, an identifier

  local id="${hash:0:8}"
  local dest="$dest_base/$name-$id"

  mkdir -p "$dest"
  cp --no-target-directory --recursive "$src/" "$dest/"

  # Deliberately unquoted: word splitting collapses the whitespace in the
  # md5sum output, which keeps the HASH.txt format unchanged.
  # shellcheck disable=SC2086
  echo $hash > "$dest/HASH.txt"

  # for .wwz file
  find-dir-html "$dest"

  log "Published host ID to $dest"

  echo "$id"
}
278
279#
280# Compilers
281#
282
# Record a compiler's '--version' output in <out_dir>/version.txt.
#
# Arguments:
#   $1 - path to the compiler; only paths ending in /gcc or /clang are run
#   $2 - output directory (default: _tmp/compiler-id/<basename of $1>);
#        created if missing
# For any other path, version.txt is still created but left empty.
dump-compiler-id() {
  local cc=$1  # path to the compiler
  local out_dir=${2:-_tmp/compiler-id/$(basename "$cc")}

  mkdir -p "$out_dir"

  case $cc in
    */gcc|*/clang)
      # -v has more details for gcc, but they might be overkill; for clang it
      # has stuff we don't want.
      "$cc" --version
      ;;
  esac > "$out_dir/version.txt"
}
300
# Print the lines of a dumped compiler version that make up its identity.
# The caller hashes this output.
#
# Arguments:
#   $1 - directory containing version.txt
# Returns: the exit status of grep (non-zero when no line is printed).
_compiler-id-hash() {
  local src=$1

  # Remove some extraneous information from clang.
  grep -v InstalledDir < "$src/version.txt"
}
307
# Copy a dumped compiler ID directory to a location named after its hash.
# Writes a short ID to stdout.
#
# Arguments:
#   $1 - directory with a dumped compiler ID; its basename becomes the label
#   $2 - base directory for published IDs
#        (default: ../benchmark-data/compiler-id)
# Side effects: creates $2/<label>-<short id>/ holding a copy of $1 plus
# HASH.txt, then calls log (defined outside this function).
publish-compiler-id() {
  local src=$1  # e.g. _tmp/compiler-id/clang
  local dest_base=${2:-../benchmark-data/compiler-id}

  # Declared separately so that a failing basename is not masked by 'local'.
  local name
  name=$(basename "$src")

  local hash
  hash=$(_compiler-id-hash "$src" | md5sum)  # not secure, an identifier

  local id="${hash:0:8}"
  local dest="$dest_base/$name-$id"

  mkdir -p "$dest"
  cp --no-target-directory --recursive "$src/" "$dest/"

  # Deliberately unquoted: word splitting collapses the whitespace in the
  # md5sum output, which keeps the HASH.txt format unchanged.
  # shellcheck disable=SC2086
  echo $hash > "$dest/HASH.txt"

  log "Published compiler ID to $dest"

  echo "$id"
}
329
330#
331# Table Output
332#
333
334# Writes a table of host and shells to stdout. Writes text files and
335# calculates IDs for them as a side effect.
336#
337# The table can be passed to other benchmarks to ensure that their provenance
338# is recorded.
339
# Arguments:
#   $1     - host label; when empty, the output of 'hostname' is used
#   $2     - job ID recorded in every row
#   $3     - base directory for published IDs ($3/host-id and $3/shell-id)
#   $4...  - shell paths to record, one output row each
# Side effects: dumps and publishes the host ID and each shell's ID, and
# overwrites _tmp/provenance.txt (no header) and _tmp/provenance.tsv (with a
# header row).
shell-provenance-2() {
  ### Write to _tmp/provenance.{txt,tsv} and $out_dir/{shell,host-id}

  local maybe_host=$1  # if it exists, it overrides the host
  local job_id=$2
  local out_dir=$3
  shift 3

  # log "*** shell-provenance"

  mkdir -p _tmp/provenance

  local host_name
  if test -n "$maybe_host"; then  # label is often 'no-host'
    host_name=$maybe_host
  else
    host_name=$(hostname)
  fi

  log "*** $maybe_host $host_name $job_id $out_dir"

  local tmp_dir=_tmp/prov-tmp/$host_name
  dump-host-id "$tmp_dir"

  local host_hash
  host_hash=$(publish-host-id "$tmp_dir" "$out_dir/host-id")
  local shell_hash

  local out_txt=_tmp/provenance.txt  # Legacy text file
  : > "$out_txt"  # truncate, no header

  local out_tsv=_tmp/provenance.tsv
  tsv-row job_id host_name host_hash sh_path shell_hash > "$out_tsv"

  local i=0

  # Local so that the loop variable does not leak into the caller's scope.
  local sh_path
  for sh_path in "$@"; do
    # There can be two different OSH

    tmp_dir=_tmp/prov-tmp/shell-$i
    i=$((i + 1))

    dump-shell-id "$sh_path" "$tmp_dir"

    # writes to ../benchmark-data or _tmp/provenance
    shell_hash=$(publish-shell-id "$tmp_dir" "$out_dir/shell-id")

    # note: filter-provenance depends on $4 being $sh_path
    # APPEND to txt
    echo "$job_id $host_name $host_hash $sh_path $shell_hash" >> "$out_txt"

    tsv-row "$job_id" "$host_name" "$host_hash" "$sh_path" "$shell_hash" >> "$out_tsv"
  done

  log "Wrote $out_txt and $out_tsv"
}
396
# Dump and publish the host ID and the IDs of gcc (looked up in $PATH) and
# $CLANG, and write one provenance row per compiler.
#
# Globals:   CLANG (read) - path to clang, set outside this function
# Outputs:   the name of the provenance file on stdout, for use in a command
#            substitution
# Side effects: writes
# _tmp/provenance/<host>.<job id>.compiler-provenance.txt, creating the
# directory if needed.
compiler-provenance() {
  local job_id
  job_id=$(print-job-id)

  local host
  host=$(hostname)

  # Filename
  local out=_tmp/provenance/${host}.${job_id}.compiler-provenance.txt

  # The redirect below fails if the directory does not exist yet.
  mkdir -p _tmp/provenance

  local tmp_dir=_tmp/host-id/$host
  dump-host-id "$tmp_dir"

  local host_hash
  host_hash=$(publish-host-id "$tmp_dir")

  # gcc is assumed to be in the $PATH.  A compiler that is missing or empty
  # is skipped instead of producing an empty loop item.
  local -a compilers=()
  local gcc_path
  if gcc_path=$(command -v gcc); then
    compilers+=("$gcc_path")
  fi
  if test -n "$CLANG"; then
    compilers+=("$CLANG")
  fi

  local compiler_path name compiler_hash
  # The ${arr[@]+...} form keeps an empty array from tripping 'nounset' on
  # older bash versions.
  for compiler_path in ${compilers[@]+"${compilers[@]}"}; do
    name=$(basename "$compiler_path")

    tmp_dir=_tmp/compiler-id/$name
    dump-compiler-id "$compiler_path" "$tmp_dir"

    compiler_hash=$(publish-compiler-id "$tmp_dir")

    echo "$job_id $host $host_hash $compiler_path $compiler_hash"
  done > "$out"

  log "Wrote $out"

  # Return value used in command sub
  echo "$out"
}
432
# Assign the string 'returned' to the caller's variable whose name is given.
#
# Arguments:
#   $1 - name of the variable to assign
out-param() {
  # The nameref gets a name callers are unlikely to use: if it were called
  # 'out', then 'out-param out' would make the reference point at itself,
  # which bash reports as a circular name reference.
  declare -n __out_param_ref=$1

  __out_param_ref=returned
}
# Run the function named on the command line, but only when this file is
# invoked under the name id.sh (not under another script's name).
# ${0##*/} takes the basename without word splitting, so a $0 containing
# whitespace no longer breaks the comparison.
if [[ "${0##*/}" == 'id.sh' ]]; then
  "$@"
fi
442