benchmarks/id.sh

OILS / benchmarks / id.sh View on Github | oilshell.org

442 lines, 221 significant

1	#!/usr/bin/env bash
2	#
3	# Keep track of benchmark data provenance.
4	#
5	# Usage:
6	# benchmarks/id.sh <function name>
7
# Strict mode: fail on unset variables, on a failure in any pipeline stage,
# and on any unhandled non-zero exit status.
set -o nounset
set -o pipefail
set -o errexit

# Parent of the directory that contains $0.  The expansions are quoted so a
# checkout path with spaces works, and '&&' is used so that a failed 'cd' is
# an error instead of silently yielding the current directory.
REPO_ROOT=$(cd "$(dirname "$0")/.." && pwd)
readonly REPO_ROOT
14
15	source build/common.sh # for $CLANG
16	source benchmarks/common.sh
17	source soil/common.sh # find-dir-html
18	source test/tsv-lib.sh # tsv-row
19
print-job-id() {
  ### Print a job ID derived from the current local time to stdout
  #
  # Format: YYYY-MM-DD__HH-MM-SS

  date '+%Y-%m-%d__%H-%M-%S'
}
23
24	# TODO: add benchmark labels/hashes for osh and all other shells
25	#
26	# Need to archive labels too.
27	#
28	# TODO: How do I make sure the zsh label is current? Across different
29	# machines?
30	#
31	# What happens when zsh is silently upgraded?
32	# I guess before every benchmark, you have to run the ID collection. Man
33	# that is a lot of code.
34	#
35	# Should I make symlinks to the published location?
36	#
37	# Maybe bash/dash/mksh/zsh should be invoked through a symlink?
38	# Every symlink is a shell runtime version, and it has an associated
39	# toolchain?
40
# Platform is ambient?
# _tmp/
#   shell-id/
#     bash/
#       HASH.txt
#       version.txt
#     dash/
#       HASH.txt
#       version.txt
#   host-id/
#     lisa/
#       HASH.txt
#       cpuinfo.txt
54
# ../benchmark-data/
#   shell-id/
#     bash-$HASH/
#     osh-$HASH/   # osh-cpython, osh-ovm?   osh-opy-ovm?  Too many dimensions.
#                  # the other shells don't have this?
#     zsh-$HASH/
#   host-id/
#     lisa-$HASH/
63
_dump-if-exists() {
  ### Copy the contents of $path to $out, but only if $path is a regular file
  #
  # Args:
  #   path: file to read
  #   out: file to write (overwritten)
  #
  # Does nothing and returns 0 when $path does not exist.

  local path=$1
  local out=$2

  if ! test -f "$path"; then
    return
  fi

  # $out is quoted: an unquoted redirect target containing spaces is an
  # "ambiguous redirect" error in bash.
  cat "$path" > "$out"
}
72
73	#
74	# Shell ID
75	#
76
dump-shell-id() {
  ### Record identifying information about a shell binary in $out_dir
  #
  # Args:
  #   sh_path: name or path of the shell; must be resolvable by 'command -v'
  #   out_dir: directory to write into (created if missing)
  #
  # Always writes sh-path.txt.  Depending on the shell it also writes
  # version.txt, osh-version.txt or dpkg-version.txt, and for paths ending in
  # /osh it writes git-branch.txt and git-commit-hash.txt.
  # Calls 'die' when the shell can't be found or its name isn't recognized.

  local sh_path=$1
  local out_dir=$2

  if ! command -v "$sh_path" >/dev/null; then
    die "dump-shell-id: Couldn't find $sh_path"
  fi

  mkdir -p "$out_dir"

  echo "$sh_path" > "$out_dir/sh-path.txt"

  # Add extra repository info for osh.
  case $sh_path in
    */osh)
      local branch
      branch=$(git rev-parse --abbrev-ref HEAD)
      echo "$branch" > "$out_dir/git-branch.txt"
      git rev-parse "$branch" > "$out_dir/git-commit-hash.txt"
      ;;
  esac

  local sh_name
  sh_name=$(basename "$sh_path")

  # Note: alternatives in a case pattern are separated by an unescaped '|'.
  # An escaped '\|' would only match a name containing a literal pipe.
  case $sh_name in
    bash|zsh|yash)
      "$sh_path" --version > "$out_dir/version.txt"
      ;;
    osh)
      case $sh_path in
        _bin/*/osh)
          # Doesn't support --version yet
          ;;
        *)
          "$sh_path" --version > "$out_dir/osh-version.txt"
          ;;
      esac
      ;;
    # oils-for-unix|oils-for-unix.stripped)
    #   ;;
    dash|mksh)
      # These don't have version strings!
      dpkg -s "$sh_name" > "$out_dir/dpkg-version.txt"
      ;;

    # not a shell, but useful for benchmarks/compute
    python2)
      # The version is captured from stderr
      "$sh_path" -V 2> "$out_dir/version.txt"
      ;;
    *)
      die "Invalid shell '$sh_name'"
      ;;
  esac
}
132
_shell-id-hash() {
  ### Print the parts of a shell ID directory that go into its hash
  #
  # Args:
  #   src: directory with any of version.txt, dpkg-version.txt,
  #        osh-version.txt and git-commit-hash.txt
  #
  # Files that don't exist are skipped.  Always returns 0.

  local src=$1

  local file

  # for shells and Python
  file=$src/version.txt
  if test -f "$file"; then
    cat "$file"
  fi

  # Only hash the dimensions we want to keep
  file=$src/dpkg-version.txt
  if test -f "$file"; then
    # A file without a matching line contributes nothing; that's not an error.
    grep -E '^Version' "$file" || true
  fi

  # Interpreter as CPython vs. OVM is what we care about, so
  # select 'Interpreter:' but not 'Interpreter version:'.
  # For example, the version is different on Ubuntu Bionic vs. Trusty, but we
  # ignore that.
  file=$src/osh-version.txt
  if test -f "$file"; then
    # In an extended regex, alternation is a bare '|'; '\|' would be a
    # literal pipe character.
    grep -E '^Oil version|^Interpreter:' "$file" || true
  fi

  # For OSH
  file=$src/git-commit-hash.txt
  if test -f "$file"; then
    cat "$file"
  fi

  return 0
}
159
publish-shell-id() {
  ### Copy temp directory to hashed location
  #
  # Args:
  #   src: directory written by dump-shell-id; must contain sh-path.txt
  #   dest_base: parent of the published directory
  #              (default ../benchmark-data/shell-id)
  #
  # The published directory is $dest_base/$sh_name-$id, where $id is the
  # first 8 characters of the md5sum of the _shell-id-hash output.
  # Writes the short ID to stdout.

  local src=$1  # e.g. _tmp/prov-tmp/osh
  local dest_base=${2:-../benchmark-data/shell-id}  # or _tmp/shell-id

  local sh_path sh_name
  read -r sh_path < "$src/sh-path.txt"
  sh_name=$(basename "$sh_path")

  local hash
  hash=$(_shell-id-hash "$src" | md5sum)  # not secure, an identifier

  local id="${hash:0:8}"
  local dest="$dest_base/$sh_name-$id"

  mkdir -p "$dest"
  cp --no-target-directory --recursive "$src/" "$dest/"

  # Deliberately unquoted: word splitting collapses the double space in
  # md5sum's 'HASH  -' output, which keeps the HASH.txt format unchanged.
  # shellcheck disable=SC2086
  echo $hash > "$dest/HASH.txt"

  # for .wwz file
  find-dir-html "$dest"

  log "Published shell ID to $dest"

  echo "$id"
}
188
189	#
190	# Platform ID
191	#
192
193	# Events that will change the env for a given machine:
194	# - kernel upgrade
195	# - distro upgrade
196
197	# How about ~/git/oilshell/benchmark-data/host-id/lisa-$HASH
198	# How to calculate the hash though?
199
dump-host-id() {
  ### Record information about the current host in $out_dir
  #
  # Args:
  #   out_dir: directory to write into (default _tmp/host-id/$(hostname));
  #            created if missing
  #
  # Writes hostname.txt, machine.txt, kernel.txt, cpuinfo.txt, meminfo.txt,
  # and lsb-release.txt if /etc/lsb-release exists.

  local out_dir=${1:-}
  if test -z "$out_dir"; then
    out_dir=_tmp/host-id/$(hostname)
  fi

  mkdir -p "$out_dir"

  hostname > "$out_dir/hostname.txt"

  # does it make sense to do individual fields like -m?
  # avoid parsing?
  # We care about the kernel and the CPU architecture.
  # There is a lot of redundant information there.
  uname -m > "$out_dir/machine.txt"
  # machine
  {
    uname --kernel-release
    uname --kernel-version
  } > "$out_dir/kernel.txt"

  _dump-if-exists /etc/lsb-release "$out_dir/lsb-release.txt"

  # remove the cpu MHz field, which changes a lot
  grep -i -v 'cpu mhz' /proc/cpuinfo > "$out_dir/cpuinfo.txt"
  # mem info doesn't make a difference?  I guess it's just nice to check that
  # it's not swapping.  But shouldn't be part of the hash.

  grep '^MemTotal' /proc/meminfo > "$out_dir/meminfo.txt"

  #head $out_dir/* 1>&2  # don't write to stdout
}
228
229	# There is already concept of the triple?
230	# http://wiki.osdev.org/Target_Triplet
231	# It's not exactly the same as what we need here, but close.
232
_host-id-hash() {
  ### Print the parts of a host ID directory that go into its hash
  #
  # Args:
  #   src: directory containing machine.txt and kernel.txt, and optionally
  #        lsb-release.txt
  #
  # Always returns 0.

  local src=$1

  # Don't hash CPU or memory
  #cat $src/cpuinfo.txt
  #cat $src/hostname.txt  # e.g. lisa

  cat "$src/machine.txt"  # e.g. x86_64
  cat "$src/kernel.txt"

  # OS
  local file=$src/lsb-release.txt
  if test -f "$file"; then
    cat "$file"
  fi

  return 0
}
251
252	# Writes a short ID to stdout.
publish-host-id() {
  ### Copy a host ID directory to a location named by its hash
  #
  # Args:
  #   src: directory written by dump-host-id; its basename is the host label
  #   dest_base: parent of the published directory
  #              (default ../benchmark-data/host-id)
  #
  # The published directory is $dest_base/$name-$id, where $id is the first
  # 8 characters of the md5sum of the _host-id-hash output.
  # Writes the short ID to stdout.

  local src=$1  # e.g. _tmp/host-id/lisa
  local dest_base=${2:-../benchmark-data/host-id}

  local name
  name=$(basename "$src")

  local hash
  hash=$(_host-id-hash "$src" | md5sum)  # not secure, an identifier

  local id="${hash:0:8}"
  local dest="$dest_base/$name-$id"

  mkdir -p "$dest"
  cp --no-target-directory --recursive "$src/" "$dest/"

  # Deliberately unquoted: word splitting collapses the double space in
  # md5sum's 'HASH  -' output, which keeps the HASH.txt format unchanged.
  # shellcheck disable=SC2086
  echo $hash > "$dest/HASH.txt"

  # for .wwz file
  find-dir-html "$dest"

  log "Published host ID to $dest"

  echo "$id"
}
278
279	#
280	# Compilers
281	#
282
dump-compiler-id() {
  ### Record the version of a compiler in $out_dir/version.txt
  #
  # Args:
  #   cc: path to the compiler; only paths ending in /gcc or /clang are queried
  #   out_dir: directory to write into
  #            (default _tmp/compiler-id/$(basename $cc)); created if missing
  #
  # For any other compiler path, version.txt is created empty.

  local cc=$1  # path to the compiler
  local out_dir=${2:-}
  if test -z "$out_dir"; then
    out_dir=_tmp/compiler-id/$(basename "$cc")
  fi

  mkdir -p "$out_dir"

  case $cc in
    */gcc|*/clang)
      # gcc: -v has more details, but they might be overkill.
      # clang: -v has stuff we don't want
      "$cc" --version
      ;;
  esac > "$out_dir/version.txt"
}
300
_compiler-id-hash() {
  ### Print the parts of a compiler ID directory that go into its hash
  #
  # Args:
  #   src: directory containing version.txt
  #
  # Returns grep's status, so it is non-zero when no line is printed.

  local src=$1

  # Remove some extraneous information from clang.
  grep -v InstalledDir "$src/version.txt"
}
307
308	# Writes a short ID to stdout.
publish-compiler-id() {
  ### Copy a compiler ID directory to a location named by its hash
  #
  # Args:
  #   src: directory written by dump-compiler-id; its basename is the
  #        compiler name
  #   dest_base: parent of the published directory
  #              (default ../benchmark-data/compiler-id)
  #
  # The published directory is $dest_base/$name-$id, where $id is the first
  # 8 characters of the md5sum of the _compiler-id-hash output.
  # Writes the short ID to stdout.

  local src=$1  # e.g. _tmp/compiler-id/clang
  local dest_base=${2:-../benchmark-data/compiler-id}

  # Declared separately so a failure of basename isn't masked by 'local'
  local name
  name=$(basename "$src")

  local hash
  hash=$(_compiler-id-hash "$src" | md5sum)  # not secure, an identifier

  local id="${hash:0:8}"
  local dest="$dest_base/$name-$id"

  mkdir -p "$dest"
  cp --no-target-directory --recursive "$src/" "$dest/"

  # Deliberately unquoted: word splitting collapses the double space in
  # md5sum's 'HASH  -' output, which keeps the HASH.txt format unchanged.
  # shellcheck disable=SC2086
  echo $hash > "$dest/HASH.txt"

  log "Published compiler ID to $dest"

  echo "$id"
}
329
330	#
331	# Table Output
332	#
333
334	# Writes a table of host and shells to stdout. Writes text files and
335	# calculates IDs for them as a side effect.
336	#
337	# The table can be passed to other benchmarks to ensure that their provenance
338	# is recorded.
339
shell-provenance-2() {
  ### Write to _tmp/provenance.{txt,tsv} and $out_dir/{shell,host-id}
  #
  # Args:
  #   maybe_host: host label; if non-empty it is used instead of $(hostname)
  #   job_id: written as the first column of every row
  #   out_dir: base directory handed to publish-host-id and publish-shell-id
  #   remaining args: shell paths, one output row each
  #
  # Both output files are truncated first.  The .txt file has no header and
  # space-separated columns; the .tsv file has a header row.

  local maybe_host=$1  # if it exists, it overrides the host
  local job_id=$2
  local out_dir=$3
  shift 3

  # log "*** shell-provenance"

  mkdir -p _tmp/provenance

  local host_name
  if test -n "$maybe_host"; then  # label is often 'no-host'
    host_name=$maybe_host
  else
    host_name=$(hostname)
  fi

  log "*** $maybe_host $host_name $job_id $out_dir"

  local tmp_dir=_tmp/prov-tmp/$host_name
  dump-host-id "$tmp_dir"

  local host_hash
  host_hash=$(publish-host-id "$tmp_dir" "$out_dir/host-id")
  local shell_hash

  local out_txt=_tmp/provenance.txt  # Legacy text file
  : > "$out_txt"  # truncate, no header

  local out_tsv=_tmp/provenance.tsv
  tsv-row job_id host_name host_hash sh_path shell_hash > "$out_tsv"

  local i=0

  # Declared local so the loop variable doesn't leak into the caller's scope
  local sh_path
  for sh_path in "$@"; do
    # There can be two different OSH, so the temp dir is numbered rather than
    # named after the shell.

    tmp_dir=_tmp/prov-tmp/shell-$i
    i=$((i + 1))

    dump-shell-id "$sh_path" "$tmp_dir"

    # writes to ../benchmark-data or _tmp/provenance
    shell_hash=$(publish-shell-id "$tmp_dir" "$out_dir/shell-id")

    # note: filter-provenance depends on $4 being $sh_path
    # APPEND to txt
    echo "$job_id $host_name $host_hash $sh_path $shell_hash" >> "$out_txt"

    tsv-row "$job_id" "$host_name" "$host_hash" "$sh_path" "$shell_hash" >> "$out_tsv"
  done

  log "Wrote $out_txt and $out_tsv"
}
396
compiler-provenance() {
  ### Record host and compiler IDs in a provenance file
  #
  # Takes no arguments.  Reads $CLANG (path to clang); gcc is looked up in
  # $PATH.  Writes one row per compiler to
  # _tmp/provenance/$host.$job_id.compiler-provenance.txt and prints that
  # file name to stdout.

  local job_id
  job_id=$(print-job-id)

  local host
  host=$(hostname)

  # Filename
  local out=_tmp/provenance/${host}.${job_id}.compiler-provenance.txt

  # The redirect below fails if the directory doesn't exist yet
  mkdir -p _tmp/provenance

  local tmp_dir=_tmp/host-id/$host
  dump-host-id "$tmp_dir"

  local host_hash
  host_hash=$(publish-host-id "$tmp_dir")

  # gcc is assumed to be in the $PATH.  It is skipped if it isn't found, and
  # an empty $CLANG is skipped too.
  local -a compilers=()
  local gcc_path
  if gcc_path=$(command -v gcc); then
    compilers+=("$gcc_path")
  fi
  if test -n "$CLANG"; then
    compilers+=("$CLANG")
  fi

  local compiler_path name compiler_hash

  # The ${a[@]+...} form avoids an 'unbound variable' error for an empty
  # array under 'set -o nounset' in older bash versions.
  for compiler_path in ${compilers[@]+"${compilers[@]}"}; do
    name=$(basename "$compiler_path")

    tmp_dir=_tmp/compiler-id/$name
    dump-compiler-id "$compiler_path" "$tmp_dir"

    compiler_hash=$(publish-compiler-id "$tmp_dir")

    echo "$job_id $host $host_hash $compiler_path $compiler_hash"
  done > "$out"

  log "Wrote $out"

  # Return value used in command sub
  echo "$out"
}
432
out-param() {
  ### Set the caller's variable named by $1 to the string 'returned'
  #
  # Args:
  #   $1: NAME of the variable to assign
  #
  # The nameref has an unusual name on purpose: a nameref may not have the
  # same name as the variable it refers to, so a plain name like 'out' would
  # break for callers whose variable is also called 'out'.

  local -n __out_param_ref=$1

  __out_param_ref=returned
}
438
# Run the function named by the arguments, but only when $0 is id.sh, i.e.
# not when this file is sourced by a script with a different name.
# ${0##*/} strips the directory part without word splitting, and unlike
# basename it can't mistake a $0 starting with '-' for an option.
if [[ "${0##*/}" == 'id.sh' ]]; then
  "$@"
fi
442