From e93d1a091952c6fd1ce5f5f4d477d37e5224aaa1 Mon Sep 17 00:00:00 2001 From: FKHals <5229803-FKHals@users.noreply.gitlab.com> Date: Fri, 30 Sep 2022 12:22:06 +0200 Subject: [PATCH] Add the VRM job ID to the process info that is sent. The job ID is only appended if the "SLURM_VRM_JOBID" environment variable is set. If it is not set, the message looks exactly as before. Consequently, if the env var is set, locserv (the test server) will report an error because it cannot parse the extended message. Also add a troubleshooting entry to the README. --- README.md | 99 ++++++++++++++++++++--------------- ompi/communicator/comm_init.c | 16 +++++- rank-swapper-agent/locserv.c | 3 ++ 3 files changed, 74 insertions(+), 44 deletions(-) diff --git a/README.md b/README.md index 712582fba2..1125cf3796 100644 --- a/README.md +++ b/README.md @@ -33,6 +33,20 @@ make all ``` Now run the two scripts first `./server.sh` and then `./client.sh` in two separate terminal windows to observe that the clients (the MPI-processes) inform the server of their identity an then the server answers with modified ranks which the client then applies, completing it's initialization. +## Troubleshooting + +### MPI not found + +If running `./client.sh` causes the following error: +```shell +./client.sh: Zeile 21: mpirun: Command not found. +``` +Then the problem is caused by MPI not being found. +In this case the `OMPI` and `PATH` variables are not properly set. +They need to point to the Open MPI installation directory (as set in the `--prefix` of the `configure` call at the installation). 
+ +--- + **Now follows the actual/default Open MPI README:** # Open MPI @@ -94,48 +108,49 @@ $ ../configure --prefix=<path> |& tee config.out The rest of this file contains: -* [General release notes about Open MPI](#general-notes) - * [Platform-specific notes](#platform-notes) - * [Compiler-specific notes](#compiler-notes) - * [Run-time support notes](#general-run-time-support-notes) - * [MPI functionality and features](#mpi-functionality-and-features) - * [OpenSHMEM functionality and - features](#openshmem-functionality-and-features) - * [MPI collectives](#mpi-collectives) - * [OpenSHMEM collectives](#openshmem-collectives) - * [Network support](#network-support) - * [Open MPI extensions](#open-mpi-extensions) -* [Detailed information on building Open MPI](#building-open-mpi) - * [Installation options](#installation-options) - * [Networking support and options](#networking-support--options) - * [Run-time system support and options](#run-time-system-support) - * [Miscellaneous support - libraries](#miscellaneous-support-libraries) - * [MPI functionality options](#mpi-functionality) - * [OpenSHMEM functionality options](#openshmem-functionality) - * [Miscellaneous functionality - options](#miscellaneous-functionality) -* [Open MPI version and library numbering - policies](#open-mpi-version-numbers-and-binary-compatibility) - * [Backwards compatibility polices](#backwards-compatibility) - * [Software version numbering](#software-version-number) - * [Shared library version numbering](#shared-library-version-number) -* [Information on how to both query and validate your Open MPI - installation](#checking-your-open-mpi-installation) -* [Description of Open MPI extensions](#open-mpi-api-extensions) - * [Compiling the extensions](#compiling-the-extensions) - * [Using the extensions](#using-the-extensions) -* [Examples showing how to compile Open MPI applications](#compiling-open-mpi-applications) -* [Examples showing how to run Open MPI 
applications](#running-open-mpi-applications) -* [Summary information on the various plugin - frameworks](#the-modular-component-architecture-mca) - * [MPI layer frameworks](#mpi-layer-frameworks) - * [OpenSHMEM component frameworks](#openshmem-component-frameworks) - * [Run-time environment - frameworks](#back-end-run-time-environment-rte-component-frameworks) - * [Miscellaneous frameworks](#miscellaneous-frameworks) - * [Other notes about frameworks](#framework-notes) -* [How to get more help](#questions--problems) +- [Custom Open MPI](#custom-open-mpi) + - [Building](#building) + - [Usage](#usage) + - [Troubleshooting](#troubleshooting) + - [MPI not found](#mpi-not-found) +- [Open MPI](#open-mpi) + - [Quick start](#quick-start) + - [Table of contents](#table-of-contents) + - [General notes](#general-notes) + - [Platform Notes](#platform-notes) + - [Compiler Notes](#compiler-notes) + - [General Run-Time Support Notes](#general-run-time-support-notes) + - [MPI Functionality and Features](#mpi-functionality-and-features) + - [OpenSHMEM Functionality and Features](#openshmem-functionality-and-features) + - [MPI Collectives](#mpi-collectives) + - [OpenSHMEM Collectives](#openshmem-collectives) + - [Network Support](#network-support) + - [Open MPI Extensions](#open-mpi-extensions) + - [Building Open MPI](#building-open-mpi) + - [Installation Options](#installation-options) + - [Networking support / options](#networking-support--options) + - [Run-time system support](#run-time-system-support) + - [Miscellaneous support libraries](#miscellaneous-support-libraries) + - [MPI Functionality](#mpi-functionality) + - [OpenSHMEM Functionality](#openshmem-functionality) + - [Miscellaneous Functionality](#miscellaneous-functionality) + - [Open MPI Version Numbers and Binary Compatibility](#open-mpi-version-numbers-and-binary-compatibility) + - [Backwards Compatibility](#backwards-compatibility) + - [Software Version Number](#software-version-number) + - [Shared Library Version 
Number](#shared-library-version-number) + - [Checking Your Open MPI Installation](#checking-your-open-mpi-installation) + - [Open MPI API Extensions](#open-mpi-api-extensions) + - [Compiling the extensions](#compiling-the-extensions) + - [Using the extensions](#using-the-extensions) + - [Compiling Open MPI Applications](#compiling-open-mpi-applications) + - [Running Open MPI Applications](#running-open-mpi-applications) + - [The Modular Component Architecture (MCA)](#the-modular-component-architecture-mca) + - [MPI layer frameworks](#mpi-layer-frameworks) + - [OpenSHMEM component frameworks](#openshmem-component-frameworks) + - [Back-end run-time environment (RTE) component frameworks:](#back-end-run-time-environment-rte-component-frameworks) + - [Miscellaneous frameworks:](#miscellaneous-frameworks) + - [Framework notes](#framework-notes) + - [Questions? Problems?](#questions--problems) Also, note that much, much more information is also available [in the Open MPI FAQ](https://www.open-mpi.org/faq/). 
diff --git a/ompi/communicator/comm_init.c b/ompi/communicator/comm_init.c index 9af7d941bc..23fe6646e1 100644 --- a/ompi/communicator/comm_init.c +++ b/ompi/communicator/comm_init.c @@ -35,6 +35,7 @@ #include <stdio.h> #include <stdlib.h> #include <sys/socket.h> +#include <sys/types.h> #include <sys/un.h> #include <unistd.h> #include <string.h> @@ -57,6 +58,8 @@ #define FD_STDIN 0 #define BUFFLEN 128 +#define JOBID_ENV_VAR "SLURM_VRM_JOBID" + /* ** Table for Fortran <-> C communicator handle conversion ** Also used by P2P code to lookup communicator based @@ -141,11 +144,20 @@ static int get_modified_ranks(uint32_t jobid, uint32_t vpid, size_t size, opal_v pid_t pid = getpid(); + const char * vrm_jobid = getenv(JOBID_ENV_VAR); + char vrm_jobid_with_leading_comma[sizeof(uint64_t) + 1] = ""; + if (NULL != vrm_jobid) { + char comma = ','; + strncat(vrm_jobid_with_leading_comma, &comma, 1); + strcat(vrm_jobid_with_leading_comma, vrm_jobid); + printf("TEST: %s", vrm_jobid_with_leading_comma); + } + char info_to_send[BUFFLEN]; memset(info_to_send, 0, BUFFLEN); snprintf(info_to_send, BUFFLEN, - "{\"msg_type\": 128, \"msg_data\": \"%d,%u,%u,%zu\"}", - pid, vpid, jobid, size); + "{\"msg_type\": 128, \"msg_data\": \"%d,%u,%u,%zu%s\"}", + pid, vpid, jobid, size, vrm_jobid_with_leading_comma); // TODO: endianness uint32_t msg_length = strlen(info_to_send) + 1; diff --git a/rank-swapper-agent/locserv.c b/rank-swapper-agent/locserv.c index 37ae142b36..7c11f2867d 100644 --- a/rank-swapper-agent/locserv.c +++ b/rank-swapper-agent/locserv.c @@ -84,6 +84,9 @@ int main(void) { uint32_t jobid = 0; size_t size = 0; int vars_read = sscanf(client_message, "{\"msg_type\": 128, \"msg_data\": \"%d,%u,%u,%zu\"}", &pid, &vpid, &jobid, &size); + if (4 != vars_read) { + errorExit("Message could not be parsed: Too many/few entries in msg_data!"); + } printf("Sscanf read count: %d (should equal 4)\n", vars_read); printf("Spawned - PID: %d, vpid: %u, jobID: %u, size: %zu\n", pid, vpid, jobid, 
size); -- GitLab