diff --git a/README.md b/README.md index 712582fba20465c30d54b280cde9a94ad834d7f7..1125cf37963154650ce10e1d7c2840f690139404 100644 --- a/README.md +++ b/README.md @@ -33,6 +33,20 @@ make all ``` Now run the two scripts first `./server.sh` and then `./client.sh` in two separate terminal windows to observe that the clients (the MPI-processes) inform the server of their identity and then the server answers with modified ranks which the client then applies, completing its initialization. +## Troubleshooting + +### MPI not found + +If running `./client.sh` causes the following error: +```shell +./client.sh: Zeile 21: mpirun: Command not found. +``` +Then the problem is caused by MPI not being found. +In this case the `OMPI` and `PATH` variables are not properly set. +They need to point to the Open MPI installation directory (as set in the `--prefix` of the `configure` call during installation). + +--- + **Now follows the actual/default Open MPI README:** # Open MPI @@ -94,48 +108,49 @@ $ ../configure --prefix=<path> |& tee config.out The rest of this file contains: -* [General release notes about Open MPI](#general-notes) - * [Platform-specific notes](#platform-notes) - * [Compiler-specific notes](#compiler-notes) - * [Run-time support notes](#general-run-time-support-notes) - * [MPI functionality and features](#mpi-functionality-and-features) - * [OpenSHMEM functionality and - features](#openshmem-functionality-and-features) - * [MPI collectives](#mpi-collectives) - * [OpenSHMEM collectives](#openshmem-collectives) - * [Network support](#network-support) - * [Open MPI extensions](#open-mpi-extensions) -* [Detailed information on building Open MPI](#building-open-mpi) - * [Installation options](#installation-options) - * [Networking support and options](#networking-support--options) - * [Run-time system support and options](#run-time-system-support) - * [Miscellaneous support - libraries](#miscellaneous-support-libraries) - * [MPI functionality 
options](#mpi-functionality) - * [OpenSHMEM functionality options](#openshmem-functionality) - * [Miscellaneous functionality - options](#miscellaneous-functionality) -* [Open MPI version and library numbering - policies](#open-mpi-version-numbers-and-binary-compatibility) - * [Backwards compatibility polices](#backwards-compatibility) - * [Software version numbering](#software-version-number) - * [Shared library version numbering](#shared-library-version-number) -* [Information on how to both query and validate your Open MPI - installation](#checking-your-open-mpi-installation) -* [Description of Open MPI extensions](#open-mpi-api-extensions) - * [Compiling the extensions](#compiling-the-extensions) - * [Using the extensions](#using-the-extensions) -* [Examples showing how to compile Open MPI applications](#compiling-open-mpi-applications) -* [Examples showing how to run Open MPI applications](#running-open-mpi-applications) -* [Summary information on the various plugin - frameworks](#the-modular-component-architecture-mca) - * [MPI layer frameworks](#mpi-layer-frameworks) - * [OpenSHMEM component frameworks](#openshmem-component-frameworks) - * [Run-time environment - frameworks](#back-end-run-time-environment-rte-component-frameworks) - * [Miscellaneous frameworks](#miscellaneous-frameworks) - * [Other notes about frameworks](#framework-notes) -* [How to get more help](#questions--problems) +- [Custom Open MPI](#custom-open-mpi) + - [Building](#building) + - [Usage](#usage) + - [Troubleshooting](#troubleshooting) + - [MPI not found](#mpi-not-found) +- [Open MPI](#open-mpi) + - [Quick start](#quick-start) + - [Table of contents](#table-of-contents) + - [General notes](#general-notes) + - [Platform Notes](#platform-notes) + - [Compiler Notes](#compiler-notes) + - [General Run-Time Support Notes](#general-run-time-support-notes) + - [MPI Functionality and Features](#mpi-functionality-and-features) + - [OpenSHMEM Functionality and 
Features](#openshmem-functionality-and-features) + - [MPI Collectives](#mpi-collectives) + - [OpenSHMEM Collectives](#openshmem-collectives) + - [Network Support](#network-support) + - [Open MPI Extensions](#open-mpi-extensions) + - [Building Open MPI](#building-open-mpi) + - [Installation Options](#installation-options) + - [Networking support / options](#networking-support--options) + - [Run-time system support](#run-time-system-support) + - [Miscellaneous support libraries](#miscellaneous-support-libraries) + - [MPI Functionality](#mpi-functionality) + - [OpenSHMEM Functionality](#openshmem-functionality) + - [Miscellaneous Functionality](#miscellaneous-functionality) + - [Open MPI Version Numbers and Binary Compatibility](#open-mpi-version-numbers-and-binary-compatibility) + - [Backwards Compatibility](#backwards-compatibility) + - [Software Version Number](#software-version-number) + - [Shared Library Version Number](#shared-library-version-number) + - [Checking Your Open MPI Installation](#checking-your-open-mpi-installation) + - [Open MPI API Extensions](#open-mpi-api-extensions) + - [Compiling the extensions](#compiling-the-extensions) + - [Using the extensions](#using-the-extensions) + - [Compiling Open MPI Applications](#compiling-open-mpi-applications) + - [Running Open MPI Applications](#running-open-mpi-applications) + - [The Modular Component Architecture (MCA)](#the-modular-component-architecture-mca) + - [MPI layer frameworks](#mpi-layer-frameworks) + - [OpenSHMEM component frameworks](#openshmem-component-frameworks) + - [Back-end run-time environment (RTE) component frameworks:](#back-end-run-time-environment-rte-component-frameworks) + - [Miscellaneous frameworks:](#miscellaneous-frameworks) + - [Framework notes](#framework-notes) + - [Questions? Problems?](#questions--problems) Also, note that much, much more information is also available [in the Open MPI FAQ](https://www.open-mpi.org/faq/). 
diff --git a/ompi/communicator/comm_init.c b/ompi/communicator/comm_init.c index 9af7d941bcebfb85ee096e849fe907ab80c3ec8b..23fe6646e16ad55960efc7a8df804c3003735def 100644 --- a/ompi/communicator/comm_init.c +++ b/ompi/communicator/comm_init.c @@ -35,6 +35,7 @@ #include <stdio.h> #include <stdlib.h> #include <sys/socket.h> +#include <sys/types.h> #include <sys/un.h> #include <unistd.h> #include <string.h> @@ -57,6 +58,8 @@ #define FD_STDIN 0 #define BUFFLEN 128 +#define JOBID_ENV_VAR "SLURM_VRM_JOBID" + /* ** Table for Fortran <-> C communicator handle conversion ** Also used by P2P code to lookup communicator based @@ -141,11 +144,20 @@ static int get_modified_ranks(uint32_t jobid, uint32_t vpid, size_t size, opal_v pid_t pid = getpid(); + const char * vrm_jobid = getenv(JOBID_ENV_VAR); + char vrm_jobid_with_leading_comma[sizeof(uint64_t) + 1] = ""; + if (NULL != vrm_jobid) { + char comma = ','; + strncat(vrm_jobid_with_leading_comma, &comma, 1); + strcat(vrm_jobid_with_leading_comma, vrm_jobid); + printf("TEST: %s", vrm_jobid_with_leading_comma); + } + char info_to_send[BUFFLEN]; memset(info_to_send, 0, BUFFLEN); snprintf(info_to_send, BUFFLEN, - "{\"msg_type\": 128, \"msg_data\": \"%d,%u,%u,%zu\"}", - pid, vpid, jobid, size); + "{\"msg_type\": 128, \"msg_data\": \"%d,%u,%u,%zu%s\"}", + pid, vpid, jobid, size, vrm_jobid_with_leading_comma); // TODO: endianness uint32_t msg_length = strlen(info_to_send) + 1; diff --git a/rank-swapper-agent/locserv.c b/rank-swapper-agent/locserv.c index 37ae142b36d4ce880b01044dddc0b76580d7bbc2..7c11f2867d12a94678db33eb4a38fe6fdf4af5cd 100644 --- a/rank-swapper-agent/locserv.c +++ b/rank-swapper-agent/locserv.c @@ -84,6 +84,9 @@ int main(void) { uint32_t jobid = 0; size_t size = 0; int vars_read = sscanf(client_message, "{\"msg_type\": 128, \"msg_data\": \"%d,%u,%u,%zu\"}", &pid, &vpid, &jobid, &size); + if (4 != vars_read) { + errorExit("Message could not be parsed: Too many/few entries in msg_data!"); + } printf("Sscanf read count: 
%d (should equal 4)\n", vars_read); printf("Spawned - PID: %d, vpid: %u, jobID: %u, size: %zu\n", pid, vpid, jobid, size);