From: Amir Hanania Date: Tue, 16 Feb 2016 20:47:04 +0000 (-0800) Subject: dtest/dapltest: add new automated test suite for HOST to MIC testing X-Git-Tag: dapl-2.1.9-1~16 X-Git-Url: https://openfabrics.org/gitweb/?a=commitdiff_plain;h=d1b5d4836ad6e89f5ec739596dea502953a0bdcf;p=~ardavis%2Fdapl.git dtest/dapltest: add new automated test suite for HOST to MIC testing Signed-off-by: Arlin Davis Signed-off-by: Amir Hanania --- diff --git a/Makefile.am b/Makefile.am index 7adaf43..ea99e6e 100755 --- a/Makefile.am +++ b/Makefile.am @@ -723,6 +723,7 @@ EXTRA_DIST = dat/common/dat_dictionary.h \ dapl.spec.in \ mpxyd.init.in \ $(man_MANS) \ + test/dtest/scripts/dtest_suite.sh \ test/dapltest/scripts/cl.sh \ test/dapltest/scripts/srv.sh \ test/dapltest/scripts/regress.sh \ diff --git a/test/scripts/dtest_suite.sh b/test/scripts/dtest_suite.sh new file mode 100755 index 0000000..d6a6713 --- /dev/null +++ b/test/scripts/dtest_suite.sh @@ -0,0 +1,1117 @@ +#!/bin/sh +# +# Copyright (c) 2016 Intel Corporation. All rights reserved. +# +# This Software is licensed under one of the following licenses: +# +# 1) under the terms of the "Common Public License 1.0" a copy of which is +# in the file LICENSE.txt in the root directory. The license is also +# available from the Open Source Initiative, see +# http://www.opensource.org/licenses/cpl.php. +# +# 2) under the terms of the "The BSD License" a copy of which is in the file +# LICENSE2.txt in the root directory. The license is also available from +# the Open Source Initiative, see +# http://www.opensource.org/licenses/bsd-license.php. +# +# 3) under the terms of the "GNU General Public License (GPL) Version 2" a +# copy of which is in the file LICENSE3.txt in the root directory. The +# license is also available from the Open Source Initiative, see +# http://www.opensource.org/licenses/gpl-license.php. +# +# Licensee has the right to choose one of the above licenses. +# +# Redistributions of source code must retain the above copyright +# notice and one of the license notices. +# +# Redistributions in binary form must reproduce both the above copyright +# notice, one of the license notices in the documentation +# and/or other materials provided with the distribution. +# +# Test Suite to test uDAPL Providers and CCL Proxy on MICs and Hosts +# +# Sample Usage, all providers, one loop, fast: +# +# ./dtest_suite.sh -P ALL -l 1 -f +# + +### --- user input section --- ### +server_list="cst-kc1 cst-kc1-mic0 cst-kc1-mic1" +client_list="cst-kc2 cst-kc2-mic0 cst-kc2-mic1 cst-kc1 cst-kc1-mic0 cst-kc1-mic1" +### --- dtest test cases fine tune zone --- ### +# Note: value zero indicacte dtest will use the test default value +b_options="0 1 4096" +u_options="0 1" +w_options="0 1" +S_options="0 9" +B_options="0 1" +D_options="0 1" +W_options="0" +# test defaults +def_provider="ofa-v2-mlx4_0-1u" +dat_conf="/etc/dat.conf" +### --- End of user input section --- ### + +script_version="1.05" + +# History log +# 1.05 - Disable data validation mode when using scif provider +# From: Amir Hanania +# 1.04 - Add data validation for dtest ping pong +# Add option not to use CPU mask in performance test +# From: Amir Hanania +# 1.03 - Add dapl tests +# From: Amir Hanania +# 1.02 - Change performane test to use dtest -W case for latency. +# Note: You must have a dtesr version that support -W to run performane test. +# From: Amir Hanania +# 1.01 - Add multi provider test +# From: Amir Hanania +# 1.00 - Initial Version +# From: Amir Hanania +# Test script to test dapl. +# Run dtest test in multiple options. +# Notes: +# 1. For performance test. Same dtest configuration is used twice. +# Once with -W for latency and once without for BW. +# + +user_provider=$def_provider +server_client_list=$server_list" "$client_list +host_list=`for i in $server_client_list; do echo $i | awk -F "-mic" '{ print $1 }'; done | sort | uniq` +provider_search_debug=0 +dapl_test_user_input="y" +ran_one_dapltest=0 +dapl_test_rep_max=100 +dapl_test_rep=$dapl_test_rep_max +mfo_test=0 +fast_test=0 +fast_test_str="" +perf_test=0 +no_inline_data=0 +debug_info=0 +v_for_test="" +user_srting="" +ctrl_c=0 +runs=0 +max_run_time=0 +dapl_mtu=0 +loops=0 +log_file_dir="dtest_perf_logs" +log_file="$log_file_dir/dtest_performance_" +unidirection_test=0 +cpu_mask="no_cpu_mask" +user_b_options="none" +dog_file=/tmp/dog.log +dog_ser=/tmp/dog.ser +dog_cli=/tmp/dog.cli +i=1 +while [ $i -lt 5000000 ]; do + b_options_for_perf_test+=" $i" + i=$(( $i*2 )) +done +mkdir -p $log_file_dir + +control_c() +# run if user hits control-c +{ + echo -en "\n*** ^c ***\n" + if [ $ctrl_c -ne 0 ]; then + echo -ne "\n*** Forced EXIT! ***\n\n" + for s in $server_list; do + ssh root@$s "killall dtest" > /dev/null 2>&1 + ssh root@$s "killall dapltest" > /dev/null 2>&1 + done + for c in $client_list; do + ssh root@$c "killall dtest" > /dev/null 2>&1 + ssh root@$c "killall dapltest" > /dev/null 2>&1 + done + exit 1 + fi + let "ctrl_c+=1" + echo -en "\n*** Will break after this test case ***\n\n" +} + +# trap keyboard interrupt (control-c) +trap control_c SIGINT + +exit_control() +{ + # if dog killed us. Clean up the dtest still working. + for s in $server_list; do + ssh root@$c "killall dtest" > /dev/null 2>&1 + done + for c in $client_list; do + ssh root@$c "killall dtest" > /dev/null 2>&1 + done + + echo "2" > $dog_file + sleep 2 + #kill dog + # jobs -p | xargs kill +} +# trap exit to kill dog when script exit +#trap 'jobs -p | xargs kill' EXIT +trap exit_control EXIT + +function dog(){ + while true; do + val=`cat $dog_file` + if [ $val -eq 2 ]; then + exit + fi + if [ $val -eq 1 ]; then + server=`cat $dog_ser` + client=`cat $dog_cli` + server_err=`ssh root@$server "cat /tmp/dtest_ser_run.log | grep -c ERR"` + client_err=`ssh root@$client "cat /tmp/dtest_cli_run.log | grep -c ERR"` + server_fail=`ssh root@$server "cat /tmp/dtest_ser_run.log | grep -c FAIL"` + client_fail=`ssh root@$client "cat /tmp/dtest_cli_run.log | grep -c FAIL"` + if [ $server_err -gt 0 ] || [ $client_err -gt 0 ] || [ $server_fail -gt 0 ] || [ $client_fail -gt 0 ]; then + sleep 2 + echo -e "\n\n\twatchdog bark - validation test failed\n\n" + killall ${0##*/} + fi + echo -n "." + fi + sleep 1 + done +} + +function wait_for_server_to_be_ready(){ + i=99 + echo -ne "Waiting to servers to come up... $i \r" + until [ $i -eq 0 ]; do + up=0 + file_found="NOT found" + ssh root@$server [ -f /tmp/dtest_ser_run.log ] && file_found="file found" + if [ "$file_found" == "file found" ]; then + up=`ssh root@$server "cat /tmp/dtest_ser_run.log | grep -c waiting"` + fi + if [ $up -eq 1 ]; then + break; + fi + let "i = i - 1" + echo -ne "Waiting to servers to come up... $i \r" + sleep 0.1 + done +} + + +u=0 +w=0 +B=0 +b=0 +S=0 +D=0 + +function testcase(){ + # Setting the dtest options + if [ $u -ne 0 ]; then + u_for_test="-u" + else + u_for_test="" + fi + if [ $w -ne 0 ]; then + w_for_test="-w" + else + w_for_test="" + fi + if [ $B -ne 0 ]; then + B_for_test="-B $B" + else + B_for_test="" + fi + if [ $b -ne 0 ]; then + b_for_test="-b $b" + else + b_for_test="" + fi + if [ $S -ne 0 ]; then + S_for_test="-S $S" + else + S_for_test="" + fi + if [ $W -ne 0 ]; then + W_for_test="-W" + else + W_for_test="" + fi + if [ $D -ne 0 ]; then + if [ $do_not_validate_data_with_scif -eq 1 ]; then + return 0 + fi + D_for_test="-D -a -B 10" + else + D_for_test="" + fi + + if [ $ctrl_c -ne 0 ]; then + echo -ne "\n*** Stop test due to ctrl c ***\n\n" + exit 1 + fi + + # in case the prev test failed. The files will be still there for debug. Delete them for the new run. + ssh root@$server "rm /tmp/dtest_ser_run.log" > /dev/null 2>&1 + ssh root@$client "rm /tmp/dtest_cli_run.log" > /dev/null 2>&1 + + if [ $D -eq 1 ]; then + support_data_validation + if [ $dtest_support_data_val -ne 1 ]; then + return + fi + fi + + #Start the server + echo "----------------------------------------------------------" + echo "Test case: $W_for_test $D_for_test $u_for_test $w_for_test $B_for_test $b_for_test $S_for_test $v_for_test $user_srting" + echo -ne "Start $taskset_4_server dtest -P $provider server $server\r" + ssh root@$server "$export_str $taskset_4_server dtest -P $provider $W_for_test $u_for_test $w_for_test $B_for_test $b_for_test $S_for_test $v_for_test $user_srting $D_for_test >& /tmp/dtest_ser_run.log" & + ser_pid=$! + + # Wait for server to be ready + wait_for_server_to_be_ready + + if [ $i -eq 0 ]; then + echo $server dtest failed - did not start + ssh root@$server "killall dtest" + ssh root@$client "killall dtest" + exit 1 + fi + + # Start client + echo -ne "Start $taskset_4_client dtest -P $provider client \r" + ssh root@$client "$export_str $taskset_4_client dtest -P $provider -h $server $W_for_test $u_for_test $w_for_test $B_for_test $b_for_test $S_for_test $v_for_test $user_srting $D_for_test >& /tmp/dtest_cli_run.log" & + cli_pid=$! + + if [ $D -eq 1 ]; then + echo $server > $dog_ser + echo $client > $dog_cli + echo "1" > $dog_file + fi + + # Wait for Server and Client to be done + wait $ser_pid $cli_pid + + if [ $D -eq 1 ]; then + echo "0" > $dog_file + fi + + # Check results from log files + server_pass=`ssh root@$server "cat /tmp/dtest_ser_run.log | grep -c PASSED"` + client_pass=`ssh root@$client "cat /tmp/dtest_cli_run.log | grep -c PASSED"` + server_err=`ssh root@$server "cat /tmp/dtest_ser_run.log | grep -c ERR"` + client_err=`ssh root@$client "cat /tmp/dtest_cli_run.log | grep -c ERR"` + do_exit=0 + if [ $ctrl_c -ne 0 ]; then + ssh root@$server "killall -9 dtest" > /dev/null 2>&1 + ssh root@$client "killall -9 dtest" > /dev/null 2>&1 + do_exit=1 + fi + + if [ $server_pass -ne 1 ] || [ $server_err -ne 0 ]; then + echo "****** ERROR - $server server failed (with $client client) *******" + echo " log file: /tmp/dtest_ser_run.log on $server" + do_exit=1 + fi + + if [ $client_pass -ne 1 ] || [ $client_err -ne 0 ]; then + echo "****** ERROR - $client client failed (with $server server) *******" + echo " log file: /tmp/dtest_cli_run.log on $client" + do_exit=1 + fi + + if [ $do_exit -eq 1 ]; then + echo + exit 1 + fi + + # Print to screen or file the results if needed + if [ $perf_test -eq 1 ]; then + echo -ne " \r" + if [ $fast_test -eq 1 ]; then + if [ $W -ne 0 ]; then + # second run is latency test called with -W + lat=`ssh root@$client cat /tmp/dtest_cli_run.log | grep PingPong | awk -F "latency " '{print $2}' | awk -F " us" '{ print $1 }'` + res="$lat, Tx size=$res" + echo "latency: $res" + echo $res >> $log_file + else + # First test for BW + res=`ssh root@$client cat /tmp/dtest_cli_run.log | grep direction | awk -F "00 x " '{ print $2 }'` + fi + else + if [ $W -ne 0 ]; then + # second run is latency test called with -W + lat=`ssh root@$client cat /tmp/dtest_cli_run.log | grep PingPong | awk -F "latency " '{print $2}' | awk -F " us" '{ print $1 }'` + echo -e "Byte size: $b\t\tlatency: $lat\t\tBW: $res" + res=`echo $res | awk -F " MB" '{ print $1 }'` + res=$(printf "%15s" $res) + lat=$(printf "%10s" $lat) + echo -e "$b\t\t$lat\t\t$res" >> $log_file + else + # First test for BW + res=`ssh root@$client cat /tmp/dtest_cli_run.log | grep direction | awk -F "00 x $b, " '{ print $2 }'` + fi + fi + fi + ssh root@$server "rm /tmp/dtest_ser_run.log" + ssh root@$client "rm /tmp/dtest_cli_run.log" + + echo "Test case passed " + + read -t 0.01 -n 1 -s u_input + ret=$? + if [ $ret -eq 0 ] && [ "$u_input" == "i" ]; then + print_round_info + fi + + return 0 +} + + +function wait_for_it(){ + max_wait=900 + i=$max_wait + sleep_for=0.1 + test_start_time=`date +%s` + until [ $i -eq 0 ]; do + echo -n "." + sleep $sleep_for + up=`ssh root@$wait_for_it_machine cat $wait_for_it_file | grep -c "$wait_for_it_string"` + if [ $up -eq 1 ]; then + break; + fi + let "i = i - 1" + if [ $ctrl_c -ne 0 ]; then + i=0 + fi + if [ $i -eq $(( $max_wait - 20 )) ]; then + sleep_for=1 + fi + if [ $i -eq $(( $max_wait - 40 )) ]; then + sleep_for=3 + fi + done + + if [ $i -eq 0 ]; then + if [ $ctrl_c -ne 0 ]; then + echo -ne "\n\t*** Stop test due to ctrl c ***\n\n" + else + echo " failed" + echo -e "\n\n\tDid not find $wait_for_it_string string on machine: $wait_for_it_machine at file $wait_for_it_file - EXIT\n\n" + fi + ssh root@$server killall dapltest > /dev/null 2>&1 + ssh root@$client killall dapltest > /dev/null 2>&1 + exit + fi + test_end_time=`date +%s` + test_run_time=$(($test_end_time-$test_start_time)) + echo " done in $test_run_time sec" +} + + +function print_round_info(){ + now=`date +%s` + run_time=$(($now-$start_time)) + ss=$(($run_time%60)) + mm=$(($run_time/60)) + mm=$(($mm%60)) + hh=$(($run_time/3600)) + echo "**************************************************************" + echo -e "\tin round $runs - $hh h $mm m $ss s" + echo "**************************************************************" +} + + +# Check if client and server dtest support data validation +function support_data_validation() { + dtest_support_data_val=0 + + ssh root@$server "dtest -U >& /tmp/dtest_ser_run.log" + ssh root@$client "dtest -U >& /tmp/dtest_cli_run.log" + sleep .1 + ser_is_valid=`ssh root@$server cat /tmp/dtest_ser_run.log | grep -c "validate data"` + if [ $ser_is_valid -ne 1 ]; then + return 0 + fi + cli_is_valid=`ssh root@$client cat /tmp/dtest_cli_run.log | grep -c "validate data"` + if [ $cli_is_valid -ne 1 ]; then + return 0 + fi + dtest_support_data_val=1 +} + + +# Run dtest in all data size ping pong test with data validation mode between client and server +function server_client_data_validation_test(){ + + echo -e "\n\n\n\t**** dtest data validation test\t\tprovider: $provider\t\tserver: $server $taskset_4_server\t\tclient: $client $taskset_4_client ****\n" + support_data_validation + if [ $dtest_support_data_val -ne 1 ]; then + echo -e "\t**** $client or $server dtest does not support data validation - skipping ****" + return + fi + + echo -e " Start $taskset_4_server dtest -P $provider -D -a on server $server" + ssh root@$server "$export_str $taskset_4_server dtest -P $provider -D -a -B 100 >& /tmp/dtest_ser_run.log" & + ser_pid=$! + wait_for_server_to_be_ready + + echo -e " Start $taskset_4_client dtest -P $provider -D -a on client $client" + ssh root@$client "$export_str $taskset_4_client dtest -P $provider -h $server -D -a -B 100 >& /tmp/dtest_cli_run.log" & + cli_pid=$! + # just wait a bit for files on server and clien be ready before waking up the dog + sleep 1 + + echo $server > $dog_ser + echo $client > $dog_cli + echo "1" > $dog_file + + # Wait for Server and Client to be done + wait $ser_pid $cli_pid + + echo "0" > $dog_file + echo + # Check results from log files + server_pass=`ssh root@$server "cat /tmp/dtest_ser_run.log | grep -c PASSED"` + client_pass=`ssh root@$client "cat /tmp/dtest_cli_run.log | grep -c PASSED"` + server_err=`ssh root@$server "cat /tmp/dtest_ser_run.log | grep -c ERR"` + client_err=`ssh root@$client "cat /tmp/dtest_cli_run.log | grep -c ERR"` + do_exit=0 + if [ $ctrl_c -ne 0 ]; then + ssh root@$server "killall -9 dtest" > /dev/null 2>&1 + ssh root@$client "killall -9 dtest" > /dev/null 2>&1 + do_exit=1 + fi + + if [ $server_pass -ne 1 ] || [ $server_err -ne 0 ]; then + echo "****** ERROR - $server server failed (with $client client) *******" + echo " log file: /tmp/dtest_ser_run.log on $server" + do_exit=1 + fi + + if [ $client_pass -ne 1 ] || [ $client_err -ne 0 ]; then + echo "****** ERROR - $client client failed (with $server server) *******" + echo " log file: /tmp/dtest_cli_run.log on $client" + do_exit=1 + fi + + if [ $do_exit -eq 1 ]; then + echo + exit 1 + fi + + echo -e "\n\tdtest data validation test\t\tserver: $server\t\tclient: $client\t\tprovider: $provider\t\tTEST PASSED\n\n" + +} + + +# Run dapltest between client and server +function server_client_dapl_test(){ + ofa_post="" + dapl_test_rep=$dapl_test_rep_max + if [ $ctrl_c -ne 0 ]; then + echo -ne "\n*** Stop test due to ctrl c ***\n\n" + exit 1 + fi + + echo "----------------------------------------------------------" + echo -ne "\t**** dapltest\t\tprovider: $provider\t\tserver: $server\t\tclient: $client " + + # in case the prev test failed. The files will be still there for debug. Delete them for the new run. + ssh root@$server "rm /tmp/dapltest_ser_run.log" > /dev/null 2>&1 + ssh root@$client "rm /tmp/dapltest_cli_run.log" > /dev/null 2>&1 + + # 1. skip if roc + # 2. check that provider is ofa or scm + is_roe=`echo $provider | grep -c roe` + if [ $is_roe -eq 1 ]; then + good_provider_for_dapltest=0 + echo -e " - provider $provider not supported - skipping ****" + echo "----------------------------------------------------------" + return 0 + fi + is_ofa=`ssh root@$server cat $dat_conf | grep $provider | grep -c libdaplofa` + is_scm=`ssh root@$server cat $dat_conf | grep $provider | grep -c libdaploscm` + if [ $is_ofa -eq 0 ] && [ $is_scm -eq 0 ]; then + good_provider_for_dapltest=0 + echo -e " - provider $provider not supported - skipping ****" + echo "----------------------------------------------------------" + return 0 + fi + if [ $is_ofa -eq 1 ]; then + dat_line=`ssh root@$server cat $dat_conf | grep $provider` + ofa_post=`echo $dat_line | grep lofa | awk '{ print $1 }' | awk -F "ofa-v2" '{ print $2 }'` + fi + ran_one_dapltest=1 + + # start server + wait_for_it_machine=$server + wait_for_it_file="/tmp/dapltest_ser_run.log" + wait_for_it_string="Dapltest: Service Point Ready" + echo -e " ****\n----------------------------------------------------------" + echo -e "dapltest\tprovider: $provider\tserver: $server\tclient: $client" + echo -ne "start dapltest server..." + ssh root@$server "dapltest -T S -D $provider >& /tmp/dapltest_ser_run.log" & + wait_for_it + + # tests + wait_for_it_machine=$client + wait_for_it_file="/tmp/dapltest_cli_run.log" + wait_for_it_string="Total WQE" + # test 1 + echo -ne "start dapltest client test 1 ..." + ssh root@$client "dapltest -T T -s $server$ofa_post -D $provider -i $dapl_test_rep -t 1 -w 1 client SR 256 server SR 256 >& /tmp/dapltest_cli_run.log" & + wait_for_it + + if [ $fast_test -eq 0 ]; then + # test 2 + if [ $dapl_test_rep -ne 1 ] && [ $test_run_time -ge 4 ]; then + dapl_test_rep=$(($dapl_test_rep/$test_run_time/8)) + if [ $dapl_test_rep -eq 0 ]; then + dapl_test_rep=1 + fi + echo Reduce rep to $dapl_test_rep + fi + echo -ne "start dapltest client test 2 ..." + ssh root@$client "rm /tmp/dapltest_cli_run.log" > /dev/null 2>&1 + ssh root@$client "dapltest -T T -s $server$ofa_post -D $provider -i $dapl_test_rep -t 1 -w 1 client SR 256 server RW 4096 server SR 256 >& /tmp/dapltest_cli_run.log" & + wait_for_it + + # test 3 + echo -ne "start dapltest client test 3 ..." + ssh root@$client "rm /tmp/dapltest_cli_run.log" > /dev/null 2>&1 + ssh root@$client "dapltest -T T -s $server$ofa_post -D $provider -i $dapl_test_rep -t 1 -w 1 client SR 256 server RR 4096 server SR 256 >& /tmp/dapltest_cli_run.log" & + wait_for_it + + # test 4 + echo -ne "start dapltest client test 4 ..." + ssh root@$client "rm /tmp/dapltest_cli_run.log" > /dev/null 2>&1 + ssh root@$client "dapltest -T T -s $server$ofa_post -D $provider -i $dapl_test_rep -t 1 -w 1 client SR 256 server RW 4096 server SR 256 client SR 256 server RW 4096 server SR 256 client SR 4096 server SR 256 >& /tmp/dapltest_cli_run.log" & + wait_for_it + + # test 5 + if [ $dapl_test_rep -ne 1 ] && [ $test_run_time -ge 2 ]; then + dapl_test_rep=$(($dapl_test_rep/8)) + if [ $dapl_test_rep -eq 0 ]; then + dapl_test_rep=1 + fi + echo Reduce rep to $dapl_test_rep + fi + echo -ne "start dapltest client test 5 ..." + ssh root@$client "rm /tmp/dapltest_cli_run.log" > /dev/null 2>&1 + ssh root@$client "dapltest -T T -s $server$ofa_post -D $provider -i $dapl_test_rep -t 1 -w 8 client SR 256 server RW 4096 server SR 256 client SR 256 server RW 4096 server SR 256 client SR 4096 server SR 256 >& /tmp/dapltest_cli_run.log" & + wait_for_it + + if [ $dapl_test_rep -ne 1 ] && [ $test_run_time -ge 2 ]; then + dapl_test_rep=$(($dapl_test_rep/4)) + if [ $dapl_test_rep -eq 0 ]; then + dapl_test_rep=1 + fi + echo Reduce rep to $dapl_test_rep + fi + # test 6 + echo -ne "start dapltest client test 6 ..." + ssh root@$client "rm /tmp/dapltest_cli_run.log" > /dev/null 2>&1 + ssh root@$client "dapltest -T T -s $server$ofa_post -D $provider -i $dapl_test_rep -t 4 -w 8 client SR 256 server RW 4096 server SR 256 client SR 256 server RW 4096 server SR 256 client SR 4096 server SR 256 >& /tmp/dapltest_cli_run.log" & + wait_for_it + fi + + # stop server + echo -n "stop dapltest server..." + ssh root@$client "rm /tmp/dapltest_cli_run.log" > /dev/null 2>&1 + ssh root@$client "dapltest -T Q -s $server$ofa_post -D $provider >& /tmp/dapltest_cli_run.log" & + cli_pid=$! + + wait_for_it_machine=$server + wait_for_it_file="/tmp/dapltest_ser_run.log" + wait_for_it_string="Exiting" + echo -n "wait for dapltest server to stop..." + wait_for_it + + # Wait for Server and Client to be done + wait $cli_pid + + # clean up + ssh root@$server "rm /tmp/dapltest_ser_run.log" > /dev/null 2>&1 + ssh root@$client "rm /tmp/dapltest_cli_run.log" > /dev/null 2>&1 + + echo -e "\tdapltest\t\tserver: $server\t\tclient: $client\t\tprovider: $provider\t\tTESTS PASSED" + echo -e "----------------------------------------------------------\n" +} + + +# Run all the test cases between two machines. +function server_host_test(){ + taskset_4_server="" + taskset_4_client="" + if [ $perf_test -eq 1 ]; then + is_mic=`echo $server | grep -c mic` + if [ $is_mic -eq 0 ] && [ "$cpu_mask" != "no_cpu_mask" ]; then + taskset_4_server="taskset $cpu_mask " + fi + is_mic=`echo $client | grep -c mic` + if [ $is_mic -eq 0 ] && [ "$cpu_mask" != "no_cpu_mask" ]; then + taskset_4_client="taskset $cpu_mask " + fi + + echo -e "\n**** dtest: provider: $provider \tserver: $server \tclient: $client ****\n" >> $log_file + if [ $fast_test -eq 0 ]; then + echo -e "\nBytes\t\t Latency\t\t\t MB/s" >> $log_file + fi + fi + + if [ "$dapl_test_user_input" != "o" ]; then + echo -e "\n\n\n\t**** dtest\t\tprovider: $provider\t\tserver: $server $taskset_4_server\t\tclient: $client $taskset_4_client ****" + + #set var value to zero in order to use dtest default value for that option. + for u in $u_options; do + for w in $w_options; do + for b in $b_options; do + for S in $S_options; do + for B in $B_options; do + for D in $D_options; do + for W in $W_options; do # Always keep last. See Note 1. + # Run one test case between Client and Server. + testcase + ret=$? + if [ $ret -ne 0 ]; then + echo TEST FAILED + exit 1 + fi + sleep 1 + done + done + done + done + done + done + done + + echo -e "\n\tdtest\t\tserver: $server\t\tclient: $client\t\tprovider: $provider\t\tTEST PASSED\n\n" + + if [ $perf_test -ne 1 ] && [ $do_not_validate_data_with_scif -eq 0 ] && [ $fast_test -ne 1 ]; then + server_client_data_validation_test + fi + fi + + if [ "$dapl_test_user_input" != "n" ] && [ $good_provider_for_dapltest -eq 1 ] && [ $fast_test -ne 1 ]; then + server_client_dapl_test + fi + +} + +function help(){ + echo -e "\n\tRun dtest and dapltest accross cluster - from each client to each server\n" + echo -e "\t\tServer list: $server_list" + echo -e "\t\tClient list: $client_list\n" + echo -e "\t-P : Provider name or 'ALL' for all prividers (default $def_provider)" + echo -e "\t-f: Fast test" + echo -e "\t-l : How many test loops to run. Def forever" + echo -e "\t-t : How many minutes to run. Def forever" + echo -e "\t-p or \"no_cpu_mask\": Performance test" + echo -e "\t\tMask in 0xHEX format. should match host's /sys/class/mic/mic0/device/local_cpus" + echo -e "\t\tFor no CPU mask enter \"no_cpu_mask\"" + echo -e "\t\tConsider also: taskset mpxyd, set mcm_affinity to 2 in /etc/mpxyd.conf, performance mode at the host scaling_governor" + echo -e "\t\tConsider also to change DAPL MTU (-M optoin)" + echo -e "\t-w: Write only test" + echo -e "\t-u: uni-direction only test" + echo -e "\t-d : dapl test options. \"n\": No dapl tests. \"y\": Run dapl tests. \"o\": Run Only dapl tests (no dtest). Def: Run dapl_test" + echo -e "\t-M : DAPL MTU" + echo -e "\t-b : data size. Can be: one size, many sizes as a string or type \`all\` for all sizes power of 2" + echo -e "\t-U: \"user string\". user dtest option string ( -w -b -u and -S dtest options )" + echo -e "\t-z: use zero for -w -b -u and -S dtest options (zero mean test default value)" + echo -e "\t-i: No inline data test" + echo -e "\t-m: Force MFO test" + echo -e "\t-D: DAPL debug print in log files" + echo -e "\t-v: dtest verbose mode" + echo -e "\t-q: qib test over mlx4 (same as -m and -i options)" + echo -e "\t-V: Print the script version" + echo -e "\t-h: help" + echo -e "\n\tWhile test is running:" + echo -e "\t^c: Exit gracefully" + echo -e "\t^c^c: Forced exit" + echo -e "\ti: Print round number and time duration" + echo -e "\n\n" + exit 1 +} + + +function log(){ + if [ $provider_search_debug -eq 1 ]; then + echo -e "$@" + else + echo -n "." + fi +} + + +function providers_search(){ + echo -e "\nSearching for devices" + first_host=1 + for host in $host_list; do + # make sure host dat file exist + dat_conf_found="NOT found" + ssh root@$host "[ -f $dat_conf ]" && dat_conf_found="dat_conf_found" + if [ "$dat_conf_found" == "dat_conf_found" ]; then + log "$dat_conf found on $host" + else + echo -e "\n\t$dat_conf was not found on $host.\n\n" + exit 1 + fi + + #ib devices list + dev_list=`ssh root@$host ibv_devices | tail -n +3 | awk '{ print $1 }'` + for dev in $dev_list; do + # for each device + log Found $dev device + port_cnt=`ssh root@$host ibv_devinfo -d $dev | grep phys_port_cnt | awk '{print$2 }'` + log " $dev phys_port_cnt: $port_cnt" + for port in $(seq 1 $port_cnt); do + # for each post + log " checking $dev port $port status" + up=`ssh root@$host ibv_devinfo -d $dev -i $port | grep state | grep -c PORT_ACTIVE` + if [ $up -ne 1 ]; then + log " $dev port $i is not active" + continue + fi + log " $dev port $port is active" + log " add it to the list" + # get a list of providers that this device can use + providers+=`ssh root@$host cat $dat_conf | grep "$dev $port" | awk '{ print $1 }'` + providers+=" " + done + done + + #add network ib devices + net_dev_list=` ssh root@$host netstat -i | grep -v "no statistics available" | tail -n +3 | awk '{ print $1 }'` + for net_dev in $net_dev_list; do + # for each net device + log Found $net_dev net device + is_ib=`ssh root@$host ip addr show $net_dev | grep -c infiniband` + if [ $is_ib -ne 1 ]; then + log " $net_dev net device is not ib device" + continue + fi + log " $net_dev is infiniband device" + has_ip_addr=`ssh root@$host ip addr show $net_dev | grep inet | grep -vc inet6` + if [ $has_ip_addr -ne 1 ]; then + log " $net_dev net device has no ip addr" + continue + fi + log " $net_dev net device has IP address" + log " add it to the list" + # get a list of providers that this device can use + providers+=`ssh root@$host cat $dat_conf | grep "$net_dev 0" | awk '{ print $1 }'` + providers+=" " + done + + log; log -n "$host povider list: "; for i in $providers; do log -n "$i "; done; log + if [ $first_host -eq 1 ]; then + # just save providers from first host + hosts_providers_list=$providers + first_host=0 + else + # Merge providers from prev hosts with the one from the new host + # Keep only the providers that are on both lists + log hosts p from prev hosts: $hosts_providers_list + hosts_providers_list+=$providers + hosts_providers_list=`for p in $hosts_providers_list; do echo $p; done | sort | uniq -d` + log hosts p after merge: $hosts_providers_list + fi + providers="" + done + cnt=0 + echo -e "\nPovider list:" + for i in $hosts_providers_list; do + echo $i + let cnt+=1 + done + if [ $cnt -eq 0 ]; then + echo -e "no devices where found\n\n" + exit + fi + echo -e "Total $cnt providers\n\n" +} + + +# check if the "server-client-provider" combination is OK +# Set server_client_provider_is_not_valid_combo to one if not a valid combo +function check_provider_server_client_combo(){ + server_client_provider_is_not_valid_combo=0 + #check the following: + # 1. scif providers can only run on the same machine. + is_scif=`echo $provider | grep -c scif` + if [ $is_scif -eq 1 ]; then + server_host=`echo $server | awk -F "-mic" '{ print $1 }'` + client_host=`echo $client | awk -F "-mic" '{ print $1 }'` + if [ $server_host == $client_host ]; then + return + else + server_client_provider_is_not_valid_combo=1 + return + fi + fi + # 2. MIC qib can only run mcm provider + is_ser_mic=`echo $server | grep -c mic` + is_cli_mic=`echo $client | grep -c mic` + if [ $is_ser_mic -eq 1 ] || [ $is_cli_mic -eq 1 ]; then + # MIC Server or Client + is_qib_provider=`echo $provider | grep -c qib` + if [ $is_qib_provider -eq 1 ]; then + # Server or Client is MIC AND qib provider - make sure provider is MCM + is_mcm=`echo $provider | grep -c m` + if [ $is_mcm -eq 1 ]; then + return + else + server_client_provider_is_not_valid_combo=1 + return + fi + fi + fi + # 3. check if MICs ib interface is UP + is_ib_provider=`echo $provider | grep -ce -ib` + if [ $is_ib_provider -eq 1 ]; then + interface=`echo $provider | awk -F "ofa-v2-" '{ print $2 }'` + if [ $is_ser_mic -eq 1 ]; then + up=`ssh root@$server ifconfig | grep -c $interface` + if [ $up -eq 1 ]; then + return + else + server_client_provider_is_not_valid_combo=1 + return + fi + fi + if [ $is_cli_mic -eq 1 ]; then + up=`ssh root@$client ifconfig | grep -c $interface` + if [ $up -eq 1 ]; then + return + else + server_client_provider_is_not_valid_combo=1 + return + fi + fi + fi +} + + + + + + +while getopts uviVzDmfwhiql:t:P:U:p:d:M:b: option +do + case "${option}" + in + P) user_provider=${OPTARG};; + m) no_inline_data=1 ; mfo_test=1;; + f) fast_test=1; loops=1; fast_test_str=" fast test";; + p) cpu_mask=${OPTARG}; perf_test=1; W_options="0 1";; + U) user_srting=${OPTARG}; b_options="0"; u_options="0"; S_options="0"; w_options="0"; B_options="0";; + z) b_options="0"; u_options="0"; S_options="0"; w_options="0"; B_options="0";; + w) w_options="1";; + u) unidirection_test=1; u_options="1";; + D) debug_info=1;; + d) dapl_test_user_input=${OPTARG};; + v) v_for_test=" -v ";; + i) no_inline_data=1;; + q) no_inline_data=1 ; mfo_test=1;; + t) max_run_time=${OPTARG};; + M) dapl_mtu=${OPTARG};; + l) loops=${OPTARG};; + b) user_b_options=${OPTARG};; + V) echo -e "\n\t${0##*/} version $script_version\n\n"; exit;; + h) help;; + esac +done + +if [ $fast_test -eq 1 ]; then + b_options="0"; u_options="0"; S_options="0"; w_options="0"; B_options="0"; D_options="0"; +fi + +if [ $perf_test -eq 1 ]; then + b_options=$b_options_for_perf_test; u_options="0"; S_options="0"; loops=1; w_options="1"; B_options="0"; user_srting="$user_string -p";dapl_test_user_input="n"; D_options="0"; + legit_input=`echo $cpu_mask | grep -ci 0x` + if [ $legit_input -ne 1 ] && [ "$cpu_mask" != "no_cpu_mask" ]; then + echo -e "\n\t< 0xCPUs_mask > or \"no_cpu_mask\" in option -p is missing - input=$cpu_mask - Exit\n\n" + exit + fi +fi + +if [ $fast_test -eq 1 ] && [ $perf_test -eq 1 ]; then + b_options="0" +fi + +if [ $unidirection_test -eq 1 ]; then + u_options="1" +fi + +if [ "$user_b_options" != "none" ]; then + if [ "$user_b_options" == "all" ]; then + b_options=$b_options_for_perf_test + else + b_options="$user_b_options" + fi +fi + +if [ "$dapl_test_user_input" != "n" ] && [ "$dapl_test_user_input" != "y" ] && [ "$dapl_test_user_input" != "o" ]; then + echo -e "\n\tdapl test option must be n/y/o - Exit\n\n" + exit +fi + +# check mpxyd is running on host machines. +for host in $host_list; do + up=`ssh root@$host "ps ax | grep -c mpxyd"` + if [ $up -ne 3 ]; then + echo -e "\n\tERROR - mpxyd is not running on $host\n\n" + exit + fi + if [ $no_inline_data -eq 1 ]; then + up=`ssh root@$host cat /var/log/mpxyd.log | grep -c "RDMA IB inline threshold 0"` + if [ $up -ne 1 ]; then + echo on host $host you need to run mpxyd with mcm_ib_inline 0 in /etc/mpxyd.conf file for no inline data test + exit 1 + fi + fi +done + +if [ $user_provider == "ALL" ] || [ $user_provider == "all" ]; then + providers_search +else + hosts_providers_list=$user_provider +fi + +echo -e "\nServer list: $server_list" +echo -e "Client list: $client_list" +echo -e "Host list:" +for i in $host_list; do + echo $i +done +echo + +if [ $mfo_test -eq 1 ]; then + export_str="export DAPL_MAX_INLINE=0 ; export DAPL_MCM_MFO=1 ; " + echo -ne "\n\t\t**** Running MFO test case \t\t$export_str ****\n\n" +elif [ $no_inline_data -eq 1 ]; then + export_str="export DAPL_MAX_INLINE=0 ; " + echo -ne "\n\t\t**** Running no inline data test case \t\t$export_str ****\n\n" +else + export_str="" +fi + +if [ $debug_info -eq 1 ]; then + export_str="$export_str export DAPL_DBG_TYPE=0xffffffff ; " + echo -ne "\n\t\t**** Running in debug mode\t\texport value: $export_str ****\n\n" +fi +if [ $dapl_mtu -ne 0 ]; then + export_str="$export_str export DAPL_IB_MTU=$dapl_mtu ; " + echo -ne "\n\t\t**** Setting DAPL_IB_MTU to $dapl_mtu \t\texport value: $export_str ****\n\n" +fi + +if [ $loops -ne 0 ]; then + echo -e "\n\tRunning$fast_test_str for $loops iterations" +fi + +if [ $max_run_time -ne 0 ]; then + echo -e "\n\tRunning$fast_test_str for $max_run_time minutes" +fi + +if [ $loops -eq 0 ] && [ $max_run_time -eq 0 ]; then + echo -ne "\n\tRunning$fast_test_str forever\n\n" +fi + +if [ $perf_test -eq 1 ]; then + if [ $unidirection_test -eq 1 ]; then + log_file+="unidirection_test-" + else + log_file+="bidirection_test-" + fi + log_file+=`date +%F-%H-%M-%S` + echo -e "\n\tRunning performance test with cpu mask: $cpu_mask\n\tOutput file: $log_file" + echo "Server list: $server_list" > $log_file + echo "Client list: $client_list" >> $log_file + echo "CPU mask: $cpu_mask" >> $log_file + if [ $dapl_mtu -ne 0 ]; then + echo "DAPL_IB_MTU: $dapl_mtu" >> $log_file + else + echo "DAPL_IB_MTU: Default value" >> $log_file + fi + if [ $unidirection_test -eq 1 ]; then + echo "Test type: unidirection test" >> $log_file + else + echo "Test type:bidirection test" >> $log_file + fi + for host in $host_list; do + op_poll=`ssh root@$host cat /var/log/mpxyd.log | grep -c "OP thread polling enabled"` + if [ $op_poll -ne 1 ]; then + echo "OP thread polling on $host: disabled" >> $log_file + echo -e "\tOP thread polling on $host: disabled" + else + echo "OP thread polling on $host: enabled" >> $log_file + echo -e "\tOP thread polling on $host: enabled" + fi + done + echo -e "\n\n" +fi +echo "0" > $dog_file +dog & + +sleep 1 +start_time=`date +%s` + +while [ 1 ]; do + now=`date +%s` + run_time=$(($now-$start_time)) + ss=$(($run_time%60)) + mm=$(($run_time/60)) + total_run_time_in_min=$mm + mm=$(($mm%60)) + hh=$(($run_time/3600)) + dd=$(($hh/24)) + hh=$(($hh%24)) + pp=$(printf "%d days %d hours %d min and %d sec" $dd $hh $mm $ss) + + echo + echo + echo "**************************************************************" + echo "**************************************************************" + echo Run time: $pp + if [ $max_run_time -ne 0 ] && [ $total_run_time_in_min -ge $max_run_time ]; then + echo -e "Ran for the $max_run_time minute requested by the user - Exiting\n\n" + break; + fi + if [ $loops -ne 0 ] && [ $runs -eq $loops ]; then + echo -e "Ran for the $loops iterations requested by the user - Exiting\n\n" + break; + fi + runs=$(( $runs + 1 )) + echo Starting round $runs + date + echo "**************************************************************" + echo "**************************************************************" + echo + + # Runinng + for provider in $hosts_providers_list; do + do_not_validate_data_with_scif=`echo $provider | grep -c scif` + good_provider_for_dapltest=1 + for server in $server_list; do + for client in $client_list; do + check_provider_server_client_combo + if [ $server_client_provider_is_not_valid_combo -ne 0 ]; then + #echo -e "***** ***** skipping test case: Server:$server Client:$client provider:$provider ***** *****" + continue + fi + # Run all test cases between Client and Server. + server_host_test + done + done + + if [ "$dapl_test_user_input" == "o" ] && [ $ran_one_dapltest -eq 0 ]; then + echo -e "\n\n\n\n\t\t***** ***** WARNING: only dapltest was set up but no dapltest was done with $provider provider $export_str ***** *****\n\n" + else + echo -e "\n\n\n\n\t\t***** ***** server client tests with $provider provider $export_str - TEST PASSED ***** *****\n\n" + fi + done +done