benchmarks/id.sh

OILS / benchmarks / id.sh View on Github | oilshell.org

441 lines, 223 significant

1	#!/usr/bin/env bash
2	#
3	# Keep track of benchmark data provenance.
4	#
5	# Usage:
6	# benchmarks/id.sh <function name>
7
# Strict mode: fail on unset variables, on a failure in any pipeline stage,
# and on any unhandled non-zero exit status.
set -o nounset
set -o pipefail
set -o errexit

# Absolute path of the directory one level above the directory holding this
# script.  Every expansion is quoted so a checkout path containing spaces
# still resolves, and '&&' keeps 'pwd' from reporting an unrelated directory
# when the 'cd' fails.
REPO_ROOT=$(cd "$(dirname "$0")/.." && pwd)
readonly REPO_ROOT
14
15	source build/common.sh # for $CLANG
16	source benchmarks/common.sh
17	source test/tsv-lib.sh # tsv-row
18
# Print the current local time as YYYY-MM-DD__HH-MM-SS on stdout.
print-job-id() {
  local fmt='+%Y-%m-%d__%H-%M-%S'
  date "$fmt"
}
22
23	# TODO: add benchmark labels/hashes for osh and all other shells
24	#
25	# Need to archive labels too.
26	#
27	# TODO: How do I make sure the zsh label is current? Across different
28	# machines?
29	#
30	# What happens when zsh is silently upgraded?
31	# I guess before every benchmark, you have to run the ID collection. Man
32	# that is a lot of code.
33	#
34	# Should I make symlinks to the published location?
35	#
36	# Maybe bash/dash/mksh/zsh should be invoked through a symlink?
37	# Every symlink is a shell runtime version, and it has an associated
38	# toolchain?
39
40	# Platform is ambient?
41	# _tmp/
42	# shell-id/
43	# bash/
44	# HASH.txt
45	# version.txt
46	# dash/
47	# HASH.txt
48	# version.txt
49	# host-id/
50	# lisa/
51	# HASH.txt
52	# cpuinfo.txt
53
54	# ../benchmark-data/
55	# shell-id/
56	# bash-$HASH/
57	# osh-$HASH/ # osh-cpython, osh-ovm? osh-opy-ovm? Too many dimensions.
58	# # the other shells don't have this?
59	# zsh-$HASH/
60	# host-id/
61	# lisa-$HASH/
62
# Copy the contents of a file to another path, but only if the source exists.
#
# Args:
#   $1  path: file to read
#   $2  out:  file to create or overwrite with the contents of $1
#
# When $1 is not a regular file this does nothing and succeeds; $2 is not
# created in that case.
_dump-if-exists() {
  local path=$1
  local out=$2

  if ! test -f "$path"; then
    return
  fi

  # The redirect target is quoted: unquoted, a name containing whitespace is
  # an "ambiguous redirect" error in bash.
  cat "$path" > "$out"
}
71
72	#
73	# Shell ID
74	#
75
# Record identifying information about a shell (or python2) in a directory.
#
# Args:
#   $1  sh_path: the shell to identify; must be resolvable by 'command -v'
#   $2  out_dir: directory to create (if needed) and fill
#
# Files written to out_dir:
#   sh-path.txt          always; the value of $1
#   git-commit-hash.txt  for paths ending in /osh: $XSHAR_GIT_COMMIT if it is
#                        set and non-empty, else the commit of the current
#                        git branch
#   git-branch.txt       for paths ending in /osh when XSHAR_GIT_COMMIT is not
#                        set
#   version.txt          bash, zsh, yash (stdout of --version);
#                        python2 (stderr of -V)
#   osh-version.txt      osh (stdout of --version), except binaries under a
#                        _bin/<something>/ directory
#   dpkg-version.txt     dash, mksh (stdout of 'dpkg -s <name>')
#
# Calls 'die' when the shell can't be found or when its basename is not one
# of the names above.
dump-shell-id() {
  local sh_path=$1
  local out_dir=$2

  if ! command -v "$sh_path" >/dev/null; then
    die "dump-shell-id: Couldn't find $sh_path"
  fi

  mkdir -p "$out_dir"

  echo "$sh_path" > "$out_dir/sh-path.txt"

  # Add extra repository info for osh.
  case $sh_path in
    */osh)
      local commit_hash=$out_dir/git-commit-hash.txt

      if test -n "${XSHAR_GIT_COMMIT:-}"; then
        echo "$XSHAR_GIT_COMMIT" > "$commit_hash"
      else
        local branch
        branch=$(git rev-parse --abbrev-ref HEAD)
        echo "$branch" > "$out_dir/git-branch.txt"
        git rev-parse "$branch" > "$commit_hash"
      fi
      ;;
  esac

  local sh_name
  sh_name=$(basename "$sh_path")

  # Alternatives in a case pattern are separated by an unescaped '|'; an
  # escaped one would only match a literal pipe character.
  case $sh_name in
    bash|zsh|yash)
      "$sh_path" --version > "$out_dir/version.txt"
      ;;
    osh)
      case $sh_path in
        *_bin/*/osh)
          # Doesn't support --version yet
          ;;
        *)
          "$sh_path" --version > "$out_dir/osh-version.txt"
          ;;
      esac
      ;;
    # oils-for-unix|oils-for-unix.stripped)
    #   ;;
    dash|mksh)
      # These don't have version strings!
      dpkg -s "$sh_name" > "$out_dir/dpkg-version.txt"
      ;;

    # not a shell, but useful for benchmarks/compute
    python2)
      # 'python2 -V' prints its version on stderr, hence '2>'.
      "$sh_path" -V 2> "$out_dir/version.txt"
      ;;
    *)
      die "Invalid shell '$sh_name'"
      ;;
  esac
}
137
# Print the parts of a dumped shell ID that should determine its hash.
#
# Args:
#   $1  src: directory holding the files written by dump-shell-id
#
# Writes to stdout, in this order, for whichever files exist in src:
#   version.txt          the whole file (shells and Python)
#   dpkg-version.txt     only lines starting with 'Version'
#   osh-version.txt      only lines starting with 'Oil version' or
#                        'Interpreter:'
#   git-commit-hash.txt  the whole file (OSH)
#
# Missing files are skipped, and the function returns 0.
_shell-id-hash() {
  local src=$1

  local file

  # for shells and Python
  file=$src/version.txt
  if [[ -f "$file" ]]; then
    cat "$file"
  fi

  # Only hash the dimensions we want to keep
  file=$src/dpkg-version.txt
  if [[ -f "$file" ]]; then
    grep -E '^Version' "$file"
  fi

  # Interpreter as CPython vs. OVM is what we care about, so
  # select 'Interpreter:' but not 'Interpreter version:'.
  # For example, the version is different on Ubuntu Bionic vs. Trusty, but we
  # ignore that.
  #
  # In an extended regex the alternation operator is a bare '|'; a
  # backslash-escaped one matches a literal pipe character instead.
  file=$src/osh-version.txt
  if [[ -f "$file" ]]; then
    grep -E '^Oil version|^Interpreter:' "$file"
  fi

  # For OSH
  file=$src/git-commit-hash.txt
  if [[ -f "$file" ]]; then
    cat "$file"
  fi

  return 0
}
164
# Copy a dumped shell ID to a directory named after the shell and its hash.
#
# Args:
#   $1  src:       directory written by dump-shell-id, e.g. _tmp/prov-tmp/osh
#   $2  dest_base: parent of the published directory
#                  (default: ../benchmark-data/shell-id)
#
# The destination is $dest_base/<shell basename>-<id>, where <id> is the first
# 8 characters of the md5sum of the output of _shell-id-hash.  The md5sum
# output is also stored in HASH.txt inside the destination.
#
# Stdout: the 8-character id.  The "Published ..." message goes through 'log'.
publish-shell-id() {
  ### Copy temp directory to hashed location

  local src=$1  # e.g. _tmp/prov-tmp/osh
  local dest_base=${2:-../benchmark-data/shell-id}  # or _tmp/shell-id

  local sh_path sh_name
  read -r sh_path < "$src/sh-path.txt"
  sh_name=$(basename "$sh_path")

  local hash
  # The hash input has to be piped into md5sum with a real '|'.
  hash=$(_shell-id-hash "$src" | md5sum)  # not secure, an identifier

  local id="${hash:0:8}"
  local dest="$dest_base/$sh_name-$id"

  mkdir -p "$dest"
  cp --no-target-directory --recursive "$src/" "$dest/"

  # Deliberately unquoted: word splitting turns md5sum's "<hex>  -" into
  # "<hex> -", which keeps the contents of HASH.txt unchanged.
  # shellcheck disable=SC2086
  echo $hash > "$dest/HASH.txt"

  log "Published shell ID to $dest"

  echo "$id"
}
190
191	#
192	# Platform ID
193	#
194
195	# Events that will change the env for a given machine:
196	# - kernel upgrade
197	# - distro upgrade
198
199	# How about ~/git/oilshell/benchmark-data/host-id/lisa-$HASH
200	# How to calculate the hash though?
201
# Record identifying information about the current machine in a directory.
#
# Args:
#   $1  out_dir: directory to create (if needed) and fill
#                (default: _tmp/host-id/<output of hostname>)
#
# Files written to out_dir:
#   hostname.txt     output of 'hostname'
#   machine.txt      output of 'uname -m'
#   kernel.txt       kernel release, then kernel version, from uname
#   lsb-release.txt  copy of /etc/lsb-release, only if that file exists
#   cpuinfo.txt      /proc/cpuinfo without the 'cpu MHz' lines
#   meminfo.txt      the MemTotal line of /proc/meminfo
dump-host-id() {
  local out_dir=${1:-_tmp/host-id/$(hostname)}

  mkdir -p "$out_dir"

  hostname > "$out_dir/hostname.txt"

  # does it make sense to do individual fields like -m?
  # avoid parsing?
  # We care about the kernel and the CPU architecture.
  # There is a lot of redundant information there.
  uname -m > "$out_dir/machine.txt"
  # machine
  {
    uname --kernel-release
    uname --kernel-version
  } > "$out_dir/kernel.txt"

  _dump-if-exists /etc/lsb-release "$out_dir/lsb-release.txt"

  # remove the cpu MHz field, which changes a lot
  grep -i -v 'cpu mhz' /proc/cpuinfo > "$out_dir/cpuinfo.txt"
  # mem info doesn't make a difference?  I guess it's just nice to check that
  # it's not swapping.  But shouldn't be part of the hash.

  grep '^MemTotal' /proc/meminfo > "$out_dir/meminfo.txt"

  #head $out_dir/* 1>&2  # don't write to stdout
}
230
# There is already the concept of a target triplet:
# http://wiki.osdev.org/Target_Triplet
# It's not exactly the same as what we need here, but close.
234
# Print the parts of a dumped host ID that should determine its hash.
#
# Args:
#   $1  src: directory holding the files written by dump-host-id
#
# Writes machine.txt, then kernel.txt, then lsb-release.txt (if present) to
# stdout.  CPU info, memory info and the hostname are intentionally left out.
_host-id-hash() {
  local src=$1

  # Don't hash CPU or memory
  #cat $src/cpuinfo.txt
  #cat $src/hostname.txt  # e.g. lisa

  cat "$src/machine.txt"  # e.g. x86_64
  cat "$src/kernel.txt"

  # OS
  local file=$src/lsb-release.txt
  if test -f "$file"; then
    cat "$file"
  fi

  return 0
}
253
# Copy a dumped host ID to a directory named after the host and its hash.
# Writes a short ID to stdout.
#
# Args:
#   $1  src:       directory written by dump-host-id, e.g. _tmp/host-id/lisa
#   $2  dest_base: parent of the published directory
#                  (default: ../benchmark-data/host-id)
#
# The destination is $dest_base/<basename of src>-<id>, where <id> is the
# first 8 characters of the md5sum of the output of _host-id-hash.  The md5sum
# output is also stored in HASH.txt inside the destination.
publish-host-id() {
  local src=$1  # e.g. _tmp/host-id/lisa
  local dest_base=${2:-../benchmark-data/host-id}

  local name
  name=$(basename "$src")

  local hash
  # The hash input has to be piped into md5sum with a real '|'.
  hash=$(_host-id-hash "$src" | md5sum)  # not secure, an identifier

  local id="${hash:0:8}"
  local dest="$dest_base/$name-$id"

  mkdir -p "$dest"
  cp --no-target-directory --recursive "$src/" "$dest/"

  # Deliberately unquoted: word splitting turns md5sum's "<hex>  -" into
  # "<hex> -", which keeps the contents of HASH.txt unchanged.
  # shellcheck disable=SC2086
  echo $hash > "$dest/HASH.txt"

  log "Published host ID to $dest"

  echo "$id"
}
277
278	#
279	# Compilers
280	#
281
# Record a compiler's version string in a directory.
#
# Args:
#   $1  cc:      path to the compiler
#   $2  out_dir: directory to create (if needed) and fill
#                (default: _tmp/compiler-id/<basename of cc>)
#
# Writes the stdout of '$cc --version' to out_dir/version.txt when the path
# ends in /gcc or /clang.  For any other path version.txt is still created,
# but left empty.
dump-compiler-id() {
  local cc=$1  # path to the compiler
  local out_dir=${2:-_tmp/compiler-id/$(basename "$cc")}

  mkdir -p "$out_dir"

  case $cc in
    */gcc | */clang)
      # -v has more details for gcc, but they might be overkill; for clang it
      # has stuff we don't want.
      "$cc" --version
      ;;
  esac > "$out_dir/version.txt"
}
299
# Print the parts of a dumped compiler ID that should determine its hash.
#
# Args:
#   $1  src: directory holding version.txt, as written by dump-compiler-id
#
# Writes version.txt to stdout minus any line containing 'InstalledDir'.
# The exit status is grep's, so it is non-zero when no line is printed.
_compiler-id-hash() {
  local src=$1

  # Remove some extraneous information from clang.
  grep -v InstalledDir "$src/version.txt"
}
306
# Copy a dumped compiler ID to a directory named after the compiler and its
# hash.  Writes a short ID to stdout.
#
# Args:
#   $1  src:       directory written by dump-compiler-id,
#                  e.g. _tmp/compiler-id/clang
#   $2  dest_base: parent of the published directory
#                  (default: ../benchmark-data/compiler-id)
#
# The destination is $dest_base/<basename of src>-<id>, where <id> is the
# first 8 characters of the md5sum of the output of _compiler-id-hash.  The
# md5sum output is also stored in HASH.txt inside the destination.
publish-compiler-id() {
  local src=$1  # e.g. _tmp/compiler-id/clang
  local dest_base=${2:-../benchmark-data/compiler-id}

  # Declared separately from the assignment so a failing basename isn't
  # masked by the exit status of 'local'.
  local name
  name=$(basename "$src")

  local hash
  # The hash input has to be piped into md5sum with a real '|'.
  hash=$(_compiler-id-hash "$src" | md5sum)  # not secure, an identifier

  local id="${hash:0:8}"
  local dest="$dest_base/$name-$id"

  mkdir -p "$dest"
  cp --no-target-directory --recursive "$src/" "$dest/"

  # Deliberately unquoted: word splitting turns md5sum's "<hex>  -" into
  # "<hex> -", which keeps the contents of HASH.txt unchanged.
  # shellcheck disable=SC2086
  echo $hash > "$dest/HASH.txt"

  log "Published compiler ID to $dest"

  echo "$id"
}
328
329	#
330	# Table Output
331	#
332
333	# Writes a table of host and shells to stdout. Writes text files and
334	# calculates IDs for them as a side effect.
335	#
336	# The table can be passed to other benchmarks to ensure that their provenance
337	# is recorded.
338
# Dump and publish the host ID and one shell ID per given shell, and record
# them as a table.
#
# Args:
#   $1     maybe_host: host label; when empty, the output of 'hostname' is used
#   $2     job_id:     identifier written into every row
#   $3     out_dir:    IDs are published under $out_dir/host-id and
#                      $out_dir/shell-id
#   $4...  sh_path:    shells to identify, one row each
#
# Outputs (relative to the current directory):
#   _tmp/provenance.txt  space-separated rows, no header
#   _tmp/provenance.tsv  header row plus one row per shell, via tsv-row
# Scratch dumps go to _tmp/prov-tmp/<host> and _tmp/prov-tmp/shell-<index>.
shell-provenance-2() {
  ### Write to _tmp/provenance.{txt,tsv} and $out_dir/{shell,host-id}

  local maybe_host=$1  # if it exists, it overrides the host
  local job_id=$2
  local out_dir=$3
  shift 3

  # log "*** shell-provenance"

  mkdir -p _tmp/provenance

  local host_name
  if test -n "$maybe_host"; then  # label is often 'no-host'
    host_name=$maybe_host
  else
    host_name=$(hostname)
  fi

  log "*** $maybe_host $host_name $job_id $out_dir"

  local tmp_dir=_tmp/prov-tmp/$host_name
  dump-host-id "$tmp_dir"

  local host_hash
  host_hash=$(publish-host-id "$tmp_dir" "$out_dir/host-id")
  local shell_hash

  local out_txt=_tmp/provenance.txt  # Legacy text file
  : > "$out_txt"  # truncate, no header

  local out_tsv=_tmp/provenance.tsv
  tsv-row job_id host_name host_hash sh_path shell_hash > "$out_tsv"

  local i=0

  # Declared local so the loop variable doesn't leak into the caller's scope.
  local sh_path
  for sh_path in "$@"; do
    # There can be two different OSH

    tmp_dir=_tmp/prov-tmp/shell-$i
    i=$((i + 1))

    dump-shell-id "$sh_path" "$tmp_dir"

    # writes to ../benchmark-data or _tmp/provenance
    shell_hash=$(publish-shell-id "$tmp_dir" "$out_dir/shell-id")

    # note: filter-provenance depends on $4 being $sh_path
    # APPEND to txt
    echo "$job_id $host_name $host_hash $sh_path $shell_hash" >> "$out_txt"

    tsv-row "$job_id" "$host_name" "$host_hash" "$sh_path" "$shell_hash" >> "$out_tsv"
  done

  log "Wrote $out_txt and $out_tsv"
}
395
# Dump and publish the host ID and the IDs of gcc and $CLANG, and record them
# in a text file.
#
# Reads: $CLANG (path to clang).  gcc is looked up in $PATH.
#
# Output file (relative to the current directory):
#   _tmp/provenance/<host>.<job id>.compiler-provenance.txt
# with one line per compiler:
#   <job id> <host> <host hash> <compiler path> <compiler hash>
#
# Stdout: the path of the output file, for use in command substitution.
compiler-provenance() {
  local job_id
  job_id=$(print-job-id)

  local host
  host=$(hostname)

  # Filename
  local out=_tmp/provenance/${host}.${job_id}.compiler-provenance.txt
  # The redirect into $out below fails unless its directory already exists.
  mkdir -p _tmp/provenance

  local tmp_dir=_tmp/host-id/$host
  dump-host-id "$tmp_dir"

  local host_hash
  host_hash=$(publish-host-id "$tmp_dir")

  # gcc is assumed to be in the $PATH.  If it isn't, gcc_path is empty and the
  # loop below skips it.
  local gcc_path
  gcc_path=$(command -v gcc || true)

  local compiler_path name compiler_hash
  for compiler_path in "$gcc_path" "$CLANG"; do
    [[ -n "$compiler_path" ]] || continue

    name=$(basename "$compiler_path")

    tmp_dir=_tmp/compiler-id/$name
    dump-compiler-id "$compiler_path" "$tmp_dir"

    compiler_hash=$(publish-compiler-id "$tmp_dir")

    echo "$job_id $host $host_hash $compiler_path $compiler_hash"
  done > "$out"

  log "Wrote $out"

  # Return value used in command sub
  echo "$out"
}
431
# Assign the string 'returned' to the variable whose name is given as $1,
# through a nameref that is local to this function.
out-param() {
  local -n out=$1
  out=returned
}
437
# Run the arguments as a command (normally one of the functions above plus
# its arguments), but only when this file is being executed under the name
# id.sh.  The expansions are quoted so the check still works when the script's
# path contains spaces.
if [[ "$(basename "$0")" == 'id.sh' ]]; then
  "$@"
fi
441