diff options
Diffstat (limited to 'sbin/mom_gencfg')
-rwxr-xr-x | sbin/mom_gencfg | 559 |
1 files changed, 559 insertions, 0 deletions
#!/usr/bin/perl
# *****************************************************************************
#
#  Copyright 2011 Zuse Institute Berlin
#
#  This program is free software; you can redistribute it and/or modify
#  it under the terms of the GNU General Public License as published by
#  the Free Software Foundation.
#
#  This program is distributed in the hope that it will be useful,
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#  GNU General Public License for more details.
#
#  You should have received a copy of the GNU General Public License
#  along with this program; if not, write to the Free Software
#  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
#
#  Please send comments to kallies@zib.de
#
# *****************************************************************************
# Purpose: - called from /etc/init.d/pbs_mom during start actions.
#          - creates /var/spool/torque/mom_priv/mom.layout
#          - creates/modifies /dev/cpuset/torque
# Prereq:  - hwloc >= 1.1, http://www.open-mpi.org/projects/hwloc/
#          - Sys::Hwloc >= 0.09, http://search.cpan.org/~bka/
# Install: Install this script on each UV rack
#            /opt/torque/Scripts/mom_gencfg root:root -rwxr-xr-x
# Config:  Set MOM_GENCFG=/opt/torque/Scripts/mom_gencfg
#          in /etc/init.d/pbs_mom for UV, execute $MOM_GENCFG before
#          starting the pbs_mom daemon.
#          MOM_GENCFG can be overridden in /etc/sysconfig/pbs_mom.
# *****************************************************************************
# $Id: mom_gencfg,v 1.1.2.1 2011/01/17 10:12:46 acountin Exp $
# *****************************************************************************

#
# *** Instructions for use ***
#
# 1. Install hwloc - see contrib/hwloc_install.sh. This should already be done
#    since TORQUE needs hwloc for its cpuset implementation starting in 4.0.
# 2. Install Sys::Hwloc from CPAN.
# 3. Set $PBS_HOME to the proper value if not already set.
# 4. Update the variables in the section 'Config definitions'. Especially
#    update firstNodeId and nodesPerBoard if desired.
#      firstNodeId should be set above 0 if you have a root cpuset that you
#      wish to exclude.
#      nodesPerBoard is the number of NUMA nodes per board. Each node is
#      defined in the directory /sys/devices/system/node, in a subdirectory
#      node<node index>.
# 5. Back up your current file, just in case a variable is set incorrectly or
#    neglected.
# 6. Run this script and enjoy the layout file.
#
#


use strict;
use warnings;

use lib qw(
    /usr/lib/perl5
    /usr/lib/perl5/site_perl
    );

use Sys::Hostname;
use File::Basename;
use Getopt::Long qw(:config no_ignore_case);
use autouse 'Pod::Usage' => qw(pod2usage);
use Sys::Hwloc 0.09;

my $progName = basename($0);
my $hostName = hostname();

# Every die() message gets prefixed with the program name, see xDie().
$SIG{__DIE__} = \&xDie;

# ==============================================================================
# Setup needed before init
# ==============================================================================

# Plain run-time check; it is executed after the __DIE__ handler above has
# been installed, so the message is reported like every other fatal error.
die "This script needs at least hwloc-1.1\n"
    unless HWLOC_XSAPI_VERSION() >= 0x00010100;

# ==============================================================================
# Config definitions
# ==============================================================================

my $hostNames      = undef;           # hostname pattern to be run on, undef to skip test
my $cpusetFsName   = '/dev/cpuset';   # the name of the cpuset file system
my $cpusetBaseName = '/torque';       # the name of the parent cpuset of a job's cpuset
my $mkdirCmd       = '/bin/mkdir';    # the path to the mkdir command
my $catCmd         = '/bin/cat';      # the path to the cat command
my $echoCmd        = '/bin/echo';     # the path to the echo command
my $momCfgDir      = 'mom_priv';      # the directory where MOM configs are stored
my $momLayoutFile  = 'mom.layout';    # the name of the MOM layout file
my $firstNodeId    = 0;               # ID of 1st NUMA node to be used by Torque (start with 0)
my $lastNodeId     = undef;           # ID of last NUMA node to be used (undef means last available)
my $nodesPerBoard  = 1;               # number of NUMA nodes per nodeboard
my %cpusetConf = (
    cpus          => undef,           # undef means auto-generate
    mems          => undef,           # undef means auto-generate
    cpu_exclusive => 1,               #
    mem_exclusive => 1,               #
    );
my %options = (
    -doLayout => 1,                   # generate mom.layout
    -withCpus => 1,                   # include cpus in mom.layout
    -withMems => 1,                   # include mems in mom.layout
    -doCpuset => 1,                   # generate/modify /torque cpuset
    -withSmt  => 1,                   # include logical processors running on the same core
    -verbose  => undef,               # be verbose to STDERR
    -dryRun   => undef,               # no actions, just tell what would be done
    );

# ==============================================================================
# Command line options
# ==============================================================================

GetOptions(
    "layout!"  => \$options{-doLayout},
    "cpus!"    => \$options{-withCpus},
    "mems!"    => \$options{-withMems},
    "smt!"     => \$options{-withSmt},
    "cpuset!"  => \$options{-doCpuset},
    "dry-run!" => \$options{-dryRun},
    "verbose!" => \$options{-verbose},
    "help|?"   => sub { usage(0) },
    "man"      => sub { manPage() },
    ) or usage(2);

if ($options{-dryRun}) {
    # A dry run is only useful when it talks, unless -noverbose was given.
    $options{-verbose} = 1 unless defined $options{-verbose};
    xDebug(">>> DryRunDryRunDryRunDryRunDryRun <<<");
}

# ==============================================================================
# Quick exit if not wanted on this host, or if no work to do
# ==============================================================================

#if(defined $hostNames) {
#    unless($hostName =~ /$hostNames/) {
#        xDebug("--- Don't run on $hostName ---");
#        exit 0;
#    }
#}

exit 0 unless ($options{-doLayout} || $options{-doCpuset});

# ==============================================================================
# See if PBS_HOME is set, and if $PBS_HOME/mom_priv exists.
# If not, we are probably not called correctly, thus die.
# See if cpusets are configured. If not, die.
# ==============================================================================

die "\$PBS_HOME not set\n" unless (exists $ENV{PBS_HOME} && $ENV{PBS_HOME});
die "PBS_HOME=$ENV{PBS_HOME} does not exist\n" unless -d $ENV{PBS_HOME};
$momCfgDir = "$ENV{PBS_HOME}/${momCfgDir}";
die "MOM config dir $momCfgDir does not exist\n" unless -d $momCfgDir;
$momLayoutFile = "${momCfgDir}/${momLayoutFile}";
die "this system does not support cpusets\n" unless -d $cpusetFsName;

# ==============================================================================
# Figure out system topology, collect wanted node objects
# ==============================================================================

my $topology = Sys::Hwloc::Topology->init;
die "Failed to init topology\n" unless defined $topology;
$topology->set_flags(HWLOC_TOPOLOGY_FLAG_WHOLE_SYSTEM);
die("Failed to load topology\n") if $topology->load;    # true return value means failure here

# ==============================================================================
# Collect nodesets of wanted NUMA nodes per nodeBoard
# ==============================================================================

my @nodeBoards = ();
my $nodeObj    = undef;
my $nNodes     = 0;      # number of NUMA nodes already put into the current nodeboard
while ($nodeObj = $topology->get_next_obj_by_type(HWLOC_OBJ_NODE, $nodeObj)) {
    my $nodeId = $nodeObj->logical_index;
    next if $nodeId < $firstNodeId;
    last if (defined $lastNodeId && $nodeId > $lastNodeId);
    if ($nNodes) {
        # Current nodeboard is not full yet, add this node to it
        $nodeBoards[$#nodeBoards]->{nodeset}->or($nodeObj->nodeset);
    } else {
        # Start a new nodeboard; its cpuset is filled in below
        push @nodeBoards, {
            cpuset  => Sys::Hwloc::Bitmap->new,
            nodeset => $nodeObj->nodeset->dup,
        };
    }
    $nNodes++;
    $nNodes = 0 if $nNodes >= $nodesPerBoard;
}

# ==============================================================================
# Assemble cpusets per nodeBoard
# ==============================================================================

foreach my $nodeBoard (@nodeBoards) {
    $topology->cpuset_from_nodeset_strict($nodeBoard->{cpuset}, $nodeBoard->{nodeset});
    next if $options{-withSmt};

    # -nosmt: for every core drop all PUs except the one with index 0
    my $core = undef;
    while ($core = $topology->get_next_obj_inside_cpuset_by_type($nodeBoard->{cpuset}, HWLOC_OBJ_CORE, $core)) {
        my $j = 1;
        while (my $pu = $topology->get_obj_inside_cpuset_by_type($core->cpuset, HWLOC_OBJ_PU, $j++)) {
            $nodeBoard->{cpuset}->andnot($pu->cpuset);
        }
    }
}

# ==============================================================================
# Generate mom.layout
# ==============================================================================

if ($options{-doLayout}) {

    xDebug("--- Generating $momLayoutFile ---");

    # The handle stays undefined during a dry run, nothing is written then.
    my $layoutFh;
    if (!$options{-dryRun}) {
        open($layoutFh, '>', $momLayoutFile)
            or die "failed to open $momLayoutFile: $!\n";
    }

    foreach my $nodeBoard (@nodeBoards) {
        my $line = sprintf("nodes=%s", $nodeBoard->{nodeset}->sprintf_list);
        $line .= sprintf(" cpus=%s", $nodeBoard->{cpuset}->sprintf_list)  if $options{-withCpus};
        $line .= sprintf(" mems=%s", $nodeBoard->{nodeset}->sprintf_list) if $options{-withMems};
        xDebug(" $line");
        next unless defined $layoutFh;
        print {$layoutFh} "$line\n"
            or die "failed to write $momLayoutFile: $!\n";
    }

    # Buffered write errors show up at close time, so check it.
    if (defined $layoutFh) {
        close($layoutFh) or die "failed to close $momLayoutFile: $!\n";
    }

}

# ==============================================================================
# Create/modify torque cpuset
# ==============================================================================

if ($options{-doCpuset}) {

    # Create it if it is not there
    my $cpusetPath = "${cpusetFsName}${cpusetBaseName}";
    if (!-d $cpusetPath) {
        xDebug("--- Creating $cpusetPath ---");
        my $rc = execCmd($mkdirCmd, 1, $cpusetPath);
        die "Failed to create $cpusetPath\n" unless defined $rc;
    }

    # Read content
    xDebug("--- Reading $cpusetPath ---");
    my $cpusetData = readCpuset($cpusetPath);
    die "Failed to read $cpusetPath\n" unless defined $cpusetData;

    # Assemble changes: only keys that exist in the cpuset and whose wanted
    # value differs from the current one end up in %cpusetMod
    my %cpusetMod = ();
    foreach my $key (sort keys %cpusetConf) {
        next unless exists $cpusetData->{$key};
        my $val = $cpusetConf{$key};

        # Auto-generate cpus/mems as the union over all nodeboards
        if (!defined $val && ($key eq 'cpus' || $key eq 'mems')) {
            my $member = $key eq 'cpus' ? 'cpuset' : 'nodeset';
            my $union  = Sys::Hwloc::Bitmap->new;
            foreach my $nodeBoard (@nodeBoards) {
                $union->or($nodeBoard->{$member});
            }
            $val = $union->sprintf_list;
            $union->free;
        }

        next unless defined $val;
        if (!defined $cpusetData->{$key} || $cpusetData->{$key} ne $val) {
            $cpusetMod{$key} = $val;
        }
    }

    # Write changes, if any. Don't abort on error, but warn if changes not done
    if (%cpusetMod) {
        xDebug("--- Modifying $cpusetPath ---");
        my @modKeys = sort keys %cpusetMod;
        if ($options{-dryRun}) {
            foreach my $key (@modKeys) {
                xDebug(sprintf(" = cpuset %s: %-25s %s", $cpusetPath, $key, $cpusetMod{$key}));
            }
        } else {
            foreach my $key (@modKeys) {
                my $val = $cpusetMod{$key};
                # The redirection is done by the shell that execCmd() spawns
                my $out = execCmd($echoCmd, 0, "$val > ${cpusetPath}/$key");
                warn "$progName - failed to set $key to \"$val\" in $cpusetPath\n"
                    unless defined $out;
            }
            if ($options{-verbose}) {
                # Read back and mark each key with '=' (value took effect) or '-'
                $cpusetData = readCpuset($cpusetPath);
                die "Failed to read $cpusetPath\n" unless defined $cpusetData;
                foreach my $key (@modKeys) {
                    my $val  = $cpusetMod{$key};
                    my $now  = $cpusetData->{$key};
                    my $same = (defined $now && $val eq $now);
                    xDebug(sprintf(" %s cpuset %s: %-25s %s", $same ? '=' : '-', $cpusetPath, $key, $val));
                }
            }
        }
    }
}

# ==============================================================================
# All done
# ==============================================================================

$topology->destroy;

exit 0;

# #############################################################################

# ==============================================================================
# Read cpuset data into a hash.
# Arg:     path of the cpuset directory
# Returns: hashref (file name => first line of the file, or undef if the file
#          is empty) on success, undef if the cpuset directory does not exist
#          or if reading one of its files failed.
#          Files that do not exist in the cpuset get no key in the hash.
# ==============================================================================

sub readCpuset {
    my $cpusetPath = shift;
    my $cpusetData = {};

    # Check if cpuset exists
    unless (-d $cpusetPath) {
        xDebug("ERROR: Cpuset $cpusetPath does not exist.");
        return;
    }

    # Read content of cpuset
    foreach my $key (qw(
        cpu_exclusive
        cpus
        mem_exclusive
        mem_hardwall
        memory_migrate
        memory_pressure
        memory_spread_page
        memory_spread_slab
        mems
        notify_on_release
        sched_load_balance
        sched_relax_domain_level
        )) {
        my $f = "${cpusetPath}/$key";
        next unless -e $f;
        my $rc = execCmd($catCmd, 0, $f);
        return unless defined $rc;              # Command failed
        my $val = @{$rc} ? $rc->[0] : undef;    # all files read here are single-valued
        xDebug(sprintf(" cpuset %s: %-25s %s", $cpusetPath, $key, defined $val ? $val : "NO DATA"));
        $cpusetData->{$key} = $val;
    }

    return $cpusetData;

}

# ==============================================================================
# Execute a command with args.
# Args:    path of the command, verbose flag, list of command arguments.
#          Command and arguments are joined with blanks and run through the
#          shell, so arguments may contain shell redirections.
# Returns: arrayref with chomped output (stdout and stderr) on success.
#          On command failure (non-zero exit code or death by signal),
#          print error msg via xDebug and return undef.
# ==============================================================================

sub execCmd {
    my $cmdBase = shift;
    my $verbose = shift;
    my @cmdArgs = @_;

    if (!$cmdBase) {
        xDebug("ERROR execCmd: need \$cmdBase.");
        return;
    }

    # --
    # Check if cmdBase is executable
    # --

    if (!-x $cmdBase) {
        xDebug("ERROR: File \"$cmdBase\" does not exist or is not executable.");
        return;
    }

    # --
    # Execute
    # --

    my $cmd = $cmdBase;
    $cmd .= (" " . join(" ", @cmdArgs)) if @cmdArgs;
    xDebug(" About to execute \"$cmd\"") if $verbose;
    open(my $cmdFh, '-|', "$cmd 2>&1") or do {
        xDebug("ERROR: Failed to execute \"$cmd\": $!");
        return;
    };

    my @cmdOut = (<$cmdFh>);
    chomp @cmdOut;

    # Closing a pipe waits for the child and puts its wait status into $?.
    # Look at the whole status, so that a child killed by a signal is not
    # mistaken for a successful run.
    close($cmdFh);
    my $status = $?;
    if ($status != 0) {
        my $rc  = $status >> 8;
        my $sig = $status & 127;
        if ($sig) {
            xDebug("ERROR: Command \"$cmd\" died with signal $sig");
        } else {
            xDebug("ERROR: Command \"$cmd\" returned rc = $rc");
        }
        if (@cmdOut) {
            # Show at most the first three lines of output
            xDebug(join("\n", map { " $_" } grep { /\S/ } $#cmdOut < 3 ? @cmdOut : (@cmdOut[0..2], "...")));
        }
        return;
    }

    # --
    # Return output
    # --

    return \@cmdOut;

}

# ==============================================================================
# Usage message
# Arg: exit code (default 0). Prints the synopsis, then exits with that code.
# ==============================================================================

sub usage {
    my $code = shift || 0;
    pod2usage(
        -verbose => 0,
        -exitval => "NOEXIT",
        );
    exit $code;
}

# ==============================================================================
# Man page
# Drops root privileges and cleans the environment before the man page is
# rendered, then exits with code 0.
# ==============================================================================

sub manPage {
    if ($< == 0) {    # Cannot invoke perldoc as root
        my $id = eval { getpwnam("nobody") };
        $id = eval { getpwnam("nouser") } unless defined $id;
        $id = -2 unless defined $id;
        $< = $id;
    }
    $> = $<;                          # Disengage setuid
    $ENV{PATH} = "/bin:/usr/bin";     # Untaint PATH
    delete @ENV{ 'IFS', 'CDPATH', 'ENV', 'BASH_ENV' };
    if ($0 =~ /^([-\/\w\.]+)$/) {
        $0 = $1;                      # Untaint $0
    } else {
        die "Illegal characters were found in \$0 ($0)\n";
    }
    pod2usage(
        -verbose => 2,
        -exitval => 0,
        );
}

# ==============================================================================
# Verbose printing
# Prints the joined arguments line by line to STDERR, each line prefixed with
# the program name. Does nothing unless the verbose option is set.
# ==============================================================================

sub xDebug {
    return unless $options{-verbose};
    my $msg = join("", @_);
    if (length $msg) {    # test the length, so that a message of "0" gets printed as is
        foreach my $line (split("\n", $msg)) {
            print STDERR "$progName - $line\n";
        }
    } else {
        print STDERR "$progName - something to debug\n";
    }
}

# ==============================================================================
# __DIE__ handler: re-raise the error with the program name prepended
# ==============================================================================

sub xDie {
    die "$progName - ", @_;
}

__END__

=head1 NAME

mom_gencfg - Create mom.layout and /dev/cpuset/torque, designed to be called from /etc/init.d/pbs_mom

=head1 SYNOPSIS

mom_gencfg --help|-?|--man

mom_gencfg -(no)layout -(no)cpus -(no)mems -(no)cpuset -(no)smt -(no)dry-run -(no)verbose

=head1 DESCRIPTION

This script creates /var/spool/torque/mom_priv/mom.layout and creates/modifies /dev/cpuset/torque
for a pbs_mom that is compiled with --enable-numa-support.

The basic configuration like number and offset of NUMA node IDs per nodeboard,
cpuset settings, and defaults of command line options is hardcoded in the script.

The script checks if I<PBS_HOME> is set in the environment. Usually this should point to
/var/spool/torque.

=head1 OPTIONS

=over 4

=item B<-(no)layout>

Create the mom.layout file or not.

=item B<-(no)cpus>

mom.layout contains cpu IDs per nodeboard or not.

=item B<-(no)mems>

mom.layout contains memory node IDs per nodeboard or not.

=item B<-(no)cpuset>

Create/modify /dev/cpuset/torque or not.

=item B<-(no)smt>

The I<cpus> entry in mom.layout and in /dev/cpuset/torque contain additional
logical processors running on the same core or not.

=item B<-(no)dry-run>

If B<-dry-run> is given, show what would have been done. Switches B<-verbose> on, unless B<-noverbose> was given.

=item B<-(no)verbose>

Verbose printing to STDERR.

=item B<-man>

Prints this man page.

=item B<-help|-?>

Prints synopsis.

=back

=head1 AUTHOR

Bernd Kallies, E<lt>kallies@zib.deE<gt>

=head1 COPYRIGHT AND LICENSE

Copyright (C) 2011 Zuse Institute Berlin

This library is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation.

=cut