! $Id$ ! ! Earth System Modeling Framework ! Copyright (c) 2002-2023, University Corporation for Atmospheric Research, ! Massachusetts Institute of Technology, Geophysical Fluid Dynamics ! Laboratory, University of Michigan, National Centers for Environmental ! Prediction, Los Alamos National Laboratory, Argonne National Laboratory, ! NASA Goddard Space Flight Center. ! Licensed under the University of Illinois-NCSA License. ! !============================================================================== !============================================================================== !ESMF_MULTI_PROC_EXAMPLE String used by test script to count examples. !============================================================================== program ESMF_RHandleBitForBitEx #include "ESMF.h" use ESMF use ESMF_TestMod implicit none ! local variables integer :: rc, i, iounit, iostat integer :: petCount, localPet type(ESMF_VM) :: vm type(ESMF_DistGrid) :: distgrid type(ESMF_Array) :: srcArray, dstArray integer, allocatable :: indexList(:) real(ESMF_KIND_R4), pointer:: farrayPtr(:) real(ESMF_KIND_R4) :: sumA, sumB, sumC, sumD, sumE, sumCompare integer :: smmElementCount integer, allocatable :: factorIndexList(:,:) real(ESMF_KIND_R4), allocatable:: factorList(:) type(ESMF_RouteHandle):: rh integer :: srcTermProcessing, pipelineDepth character(len=128) :: msg ! 
result code integer :: finalrc, result character(ESMF_MAXSTR) :: testname character(ESMF_MAXSTR) :: failMsg finalrc = ESMF_SUCCESS !------------------------------------------------------------------------- !------------------------------------------------------------------------- write(failMsg, *) "Example failure" write(testname, *) "Example ESMF_RHandleBitForBitEx" !------------------------------------------------------------------------- !------------------------------------------------------------------------- call ESMF_Initialize(vm=vm, defaultlogfilename="RHandleBitForBitEx.Log", & logkindflag=ESMF_LOGKIND_MULTI, rc=rc) if (rc /= ESMF_SUCCESS) call ESMF_Finalize(endflag=ESMF_END_ABORT) call ESMF_VMGet(vm, petCount=petCount, localPet=localPet, rc=rc) if (rc /= ESMF_SUCCESS) call ESMF_Finalize(endflag=ESMF_END_ABORT) if (petCount /= 4) then finalrc = ESMF_FAILURE goto 10 endif !BOE ! \subsubsection{Bit-for-bit reproducibility} ! \label{RH:bfb} ! ! Bit-for-bit (bfb) reproducibility is at the core of the regression testing ! schemes of many scientific model codes. The bfb requirement makes it possible ! to easily compare the numerical results of simulation runs using standard ! binary diff tools. ! ! While bfb reproducibility is desirable (and often required) for regression ! testing, it does limit the available performance optimization ! opportunities. Especially in highly parallelized code, best performance is ! often achieved by allowing operations to occur in a flexible order. Under ! some conditions, however, a change in the order of numerical operations ! leads to small numerical differences in the results, breaking bfb ! reproducibility. ! ! ESMF provides the following three levels of bfb reproducibility ! support, with the associated performance optimization implications: ! ! \begin{itemize} ! ! \item Strict bit-for-bit reproducibility: Results are guaranteed to be ! bit-for-bit identical even when executing across different numbers of PETs. ! 
The optimization options are limited to memory layout and message aggregation. ! ! \item Relaxed bit-for-bit reproducibility: Results are only guaranteed to be ! bit-for-bit identical when running across an unchanged number of PETs. The ! optimization options include partial sums, allowing computational load to ! be balanced between source and destination PETs, and message sizes to be ! reduced. ! ! \item No guarantee for bit-for-bit reproducibility: Results may differ by ! numerical round-off. The optimization options include dynamic out-of-order ! summation of partial sums. ! ! \end{itemize} ! ! The following discussion uses very simple numerical examples to demonstrate ! how the order of terms in a sum can lead to results that are not ! bit-for-bit identical. The examples use single precision, ! {\tt ESMF\_KIND\_R4} numbers, but the concepts apply the same ! to double precision, {\tt ESMF\_KIND\_R8}; only that the decimals, for ! which bfb differences in the sums occur, are different ones. ! ! With {\tt sumA}, {\tt sumB}, {\tt sumC}, {\tt sumD}, and {\tt sumE} all of ! type {\tt real(ESMF\_KIND\_R4)}, one finds the following bfb differences: !EOE !BOC sumA = (0.5 + 0.1) + 0.1 ! results in 0.700000048 sumB = 0.5 + (0.1 + 0.1) ! results in 0.699999988 sumC = 0.5 + 0.2 + 0.1 + 0.1 ! results in 0.900000036 sumD = 0.5 + (0.2 + 0.1) + 0.1 ! results in 0.900000036 sumE = 0.5 + (0.2 + 0.1 + 0.1) ! results in 0.899999976 !EOC if (localPet == 0) then print *, "sumA = ", sumA print *, "sumB = ", sumB print *, "sumC = ", sumC print *, "sumD = ", sumD print *, "sumE = ", sumE endif !BOE ! These differences result from the fact that many decimals (even very simple ! ones like 0.1 or 0.2) lead to periodic binary floating point numbers. ! Periodic floating point numbers must be truncated when represented by a ! finite number of bits, leading to small rounding errors. Further truncation ! occurs when the radix point of two numbers must be aligned during ! 
floating point arithmetic, resulting in bit shifts for one of the ! numbers. The resulting truncation error depends on the precise numbers that ! need alignment. As a result, executing the "same" sum in a different order ! can lead to different truncation steps and consequently to results that are ! not bit-for-bit identical. ! ! In order to help users with the implementation of their bfb requirement, ! ESMF provides different levels of control over the term order in sparse ! matrix multiplications, while at the same time offering performance ! optimization options. In all there are {\em three} arguments that will be ! introduced in the following paragraphs: {\tt srcTermProcessing}, ! {\tt termorderflag}, and {\tt pipelineDepth}. ! ! For the purpose of demonstration, a one-dimensional, arbitrarily distributed ! source Array is constructed. There are three Array elements on each of the ! four PETs. Their local storage indices, sequence indices, and data values ! are as follows: ! ! \begin{verbatim} ! ! +-----+-------+----------------+------------+ ! | PET | index | sequence index | data value | ! +-----+-------+----------------+------------+ ! | 0 | 1 | 1 | 0.5 | ! | 0 | 2 | 6 | 0.1 | ! | 0 | 3 | 9 | 0.1 | ! +-----+-------+----------------+------------+ ! | 1 | 1 | 4 | 0.5 | ! | 1 | 2 | 3 | 0.1 | ! | 1 | 3 | 10 | 0.1 | ! +-----+-------+----------------+------------+ ! | 2 | 1 | 11 | 0.5 | ! | 2 | 2 | 7 | 0.1 | ! | 2 | 3 | 5 | 0.1 | ! +-----+-------+----------------+------------+ ! | 3 | 1 | 8 | 0.1 | ! | 3 | 2 | 2 | 0.2 | ! | 3 | 3 | 12 | 0.1 | ! +-----+-------+----------------+------------+ ! ! \end{verbatim} !EOE !
-- srcArray -- allocate(indexList(3)) if (localPet == 0) then indexList(1) = 1 indexList(2) = 6 indexList(3) = 9 elseif (localPet == 1) then indexList(1) = 4 indexList(2) = 3 indexList(3) = 10 elseif (localPet == 2) then indexList(1) = 11 indexList(2) = 7 indexList(3) = 5 elseif (localPet == 3) then indexList(1) = 8 indexList(2) = 2 indexList(3) = 12 endif distgrid = ESMF_DistGridCreate(indexList, rc=rc) if (ESMF_LogFoundError(rcToCheck=rc, msg=ESMF_LOGERR_PASSTHRU, & line=__LINE__, & file=__FILE__)) & call ESMF_Finalize(endflag=ESMF_END_ABORT) srcArray = ESMF_ArrayCreate(distgrid=distgrid, typekind=ESMF_TYPEKIND_R4, & rc=rc) if (ESMF_LogFoundError(rcToCheck=rc, msg=ESMF_LOGERR_PASSTHRU, & line=__LINE__, & file=__FILE__)) & call ESMF_Finalize(endflag=ESMF_END_ABORT) call ESMF_ArrayGet(srcArray, farrayPtr=farrayPtr, rc=rc) if (ESMF_LogFoundError(rcToCheck=rc, msg=ESMF_LOGERR_PASSTHRU, & line=__LINE__, & file=__FILE__)) & call ESMF_Finalize(endflag=ESMF_END_ABORT) do i=1, 3 select case (indexList(i)) case (1) farrayPtr(i) = 0.5_ESMF_KIND_R4 case (2) farrayPtr(i) = 0.2_ESMF_KIND_R4 case (3) farrayPtr(i) = 0.1_ESMF_KIND_R4 case (4) farrayPtr(i) = 0.5_ESMF_KIND_R4 case (5) farrayPtr(i) = 0.1_ESMF_KIND_R4 case (6) farrayPtr(i) = 0.1_ESMF_KIND_R4 case (7) farrayPtr(i) = 0.1_ESMF_KIND_R4 case (8) farrayPtr(i) = 0.1_ESMF_KIND_R4 case (9) farrayPtr(i) = 0.1_ESMF_KIND_R4 case (10) farrayPtr(i) = 0.1_ESMF_KIND_R4 case (11) farrayPtr(i) = 0.5_ESMF_KIND_R4 case (12) farrayPtr(i) = 0.1_ESMF_KIND_R4 end select enddo deallocate(indexList) ! call ESMF_ArrayPrint(srcArray) ! --------------------------------------------------------------------------- !BOE ! The destination Array consists of only a single element, located on PET 0: ! ! \begin{verbatim} ! ! +-----+-------+----------------+------------+ ! | PET | index | sequence index | data value | ! +-----+-------+----------------+------------+ ! | 0 | 1 | 1 | n/a | ! +-----+-------+----------------+------------+ ! ! 
\end{verbatim} !EOE ! -- dstArray -- if (localPet == 0) then allocate(indexList(1)) indexList(1) = 1 else allocate(indexList(0)) endif distgrid = ESMF_DistGridCreate(indexList, rc=rc) if (ESMF_LogFoundError(rcToCheck=rc, msg=ESMF_LOGERR_PASSTHRU, & line=__LINE__, & file=__FILE__)) & call ESMF_Finalize(endflag=ESMF_END_ABORT) deallocate(indexList) dstArray = ESMF_ArrayCreate(distgrid=distgrid, typekind=ESMF_TYPEKIND_R4, & rc=rc) if (ESMF_LogFoundError(rcToCheck=rc, msg=ESMF_LOGERR_PASSTHRU, & line=__LINE__, & file=__FILE__)) & call ESMF_Finalize(endflag=ESMF_END_ABORT) call ESMF_ArrayGet(dstArray, farrayPtr=farrayPtr, rc=rc) if (ESMF_LogFoundError(rcToCheck=rc, msg=ESMF_LOGERR_PASSTHRU, & line=__LINE__, & file=__FILE__)) & call ESMF_Finalize(endflag=ESMF_END_ABORT) ! --------------------------------------------------------------------------- ! -- sparse matrix -- smmElementCount = 0 if (localPet == 0) then smmElementCount = 3 endif allocate(factorIndexList(2,smmElementCount), factorList(smmElementCount)) ! --------------------------------------------------------------------------- !BOE ! As a first example consider the following sparse matrix with three entries: !EOE if (localPet == 0) then ! ! Place all src terms on the same PET demonstrating that the src term order ! is then defined by SRCSEQ. ! ! -> factors into dst element 1 on PET 0 ! * sum with 3 addends (order by src seq. index): 0.5 + 0.1 + 0.1 ! * addend 1 is on PET 0 ! * addend 2 is on PET 0 ! * addend 3 is on PET 0 ! All three addends are on the same PET, therefore src seq index order, ! independent of the order in which they are stored in memory on the src Pet ! Also order in which sparse matrix elements are given is irrelevant. ! (1,1)*s[1]Pet0 + (6,1)*s[6]Pet0 + (9,1)*s[9]Pet0 = ! 0.5 + 0.1 + 0.1 !BOC factorIndexList(1,1) = 1 ! src seq index factorIndexList(2,1) = 1 ! dst seq index factorList(1) = 1. factorIndexList(1,2) = 6 ! src seq index factorIndexList(2,2) = 1 ! dst seq index factorList(2) = 1. 
factorIndexList(1,3) = 9 ! src seq index factorIndexList(2,3) = 1 ! dst seq index factorList(3) = 1. !EOC endif !BOE ! In ESMF, the order in which the sparse matrix entries are specified in ! {\tt factorIndexList} and {\tt factorList}, or on which PET they ! are provided, is completely irrelevant. The term order in the resulting ! sparse matrix sums is not affected by it. ! ! There is one aspect of the sparse matrix format, however, that is relevant ! to the bfb considerations: When multiple entries for the same (src, dst) ! pair are present in a sparse matrix definition, the entries are summed ! into a single (src, dst) entry. Therefore, even if there are multiple ! sparse matrix entries for the same (src, dst) pair, there will only be a ! single term for it in the resulting expression. ! ! Going back to the three term sparse matrix definition above, the ! {\em canonical} term order is defined by the source sequence indices in ! ascending order. With {\tt (src,dst)} denoting the sparse matrix factors, ! and {\tt s(src)} and {\tt d(dst)} denoting source and destination Array ! elements, respectively, for {\tt src} and {\tt dst} sequence indices, the ! sum in canonical order is: ! ! d(1) = (1,1)*s(1) + (6,1)*s(6) + (9,1)*s(9) ! ! For simplicity, the factors in all of the examples are set to {\tt 1.0}, allowing us ! to drop them in the expressions. This helps focus on the critical issue -- ! term order: ! ! d(1) = s(1) + s(6) + s(9) ! ! \begin{sloppypar} ! There are two parameters that affect term order in the ESMF sparse matrix ! multiplication (SMM), and therefore must be considered in the context of bfb ! reproducibility. First there is the {\tt srcTermProcessing} parameter which ! controls grouping of source terms located on the same PET. The value of the ! {\tt srcTermProcessing} parameter indicates the maximum number of terms that ! may be grouped into partial sums on the source PET. Setting ! 
{\tt srcTermProcessing} to 1 means that no partial sums are formed on the ! source side, however, the source terms are multiplied with their ! respective sparse matrix factor before being sent to the destination PET. ! Setting {\tt srcTermProcessing} to 0 prevents these products from being carried ! out on the source side, and the source Array elements are sent unmodified. ! Depending on the distribution of the source Array, values greater than 1 ! for {\tt srcTermProcessing} can lead to partial sums and thus may have ! impact on the bfb reproducibility of the SMM. ! \end{sloppypar} ! ! The second parameter that may have bfb effects comes into play at ! execution-time of a precomputed ! RouteHandle. It is accessible via the {\tt termorderflag} argument; a typed ! flag with the following values: ! \begin{itemize} ! \item {\tt ESMF\_TERMORDER\_SRCSEQ} -- Strictly enforces the canonical order ! of the source terms according to the source sequence index. However, ! terms that are grouped together in the RouteHandle at store-time, as a ! consequence of {\tt srcTermProcessing}, are treated as ! single entities with a sequence index equal to the lowest original ! sequence index in the group. Use {\tt ESMF\_TERMORDER\_SRCSEQ} together ! with {\tt srcTermProcessing=0} or {\tt srcTermProcessing=1} when strict ! bfb reproducibility is required independent of the source Array ! distribution, e.g. for different number of PETs. ! \item {\tt ESMF\_TERMORDER\_SRCPET} -- The source terms in the sum are ! first arranged according to the relative position of the PET on which ! they reside with respect to the destination PET. Second, all the terms ! coming from the same PET are sorted in canonical sequence index order ! and summed into partial sums. Again, terms that are grouped together ! in the RouteHandle at store-time are treated as ! single entities with a sequence index equal to the lowest original ! sequence index in the group. The final result for each destination ! 
element is determined by adding the partial sums in an order that is ! fixed by the position of the partial sums' source PETs relative to ! the destination PET. This ensures bfb reproducibility of the result as ! long as the number of PETs remains unchanged. ! \item {\tt ESMF\_TERMORDER\_FREE} -- For this option there are no ! restrictions on the term ! order. Terms can be summed in any order, and the order may change each ! time the RouteHandle is executed. This option grants greatest flexibility ! to the RouteHandle execution implementation. It is available for all the ! methods that take the {\tt termorderflag} argument. Without a ! guaranteed source term order, the {\tt ESMF\_TERMORDER\_FREE} option is ! not suitable for situations that require bfb reproducibility. ! \end{itemize} !EOE !BOE ! {\bf ESMF\_TERMORDER\_SRCSEQ} ! ! First using {\tt srcTermProcessing=0} at store time and ! {\tt termorderflag=ESMF\_TERMORDER\_SRCSEQ} at execution time, ! the canonical term order is expected: ! ! d(1) = s(1) + s(6) + s(9) = 0.5 + 0.1 + 0.1 = sumA ! !EOE !BOC ! 
forced srcTermProcessing srcTermProcessing = 0 call ESMF_ArraySMMStore(srcArray, dstArray, & factorIndexList=factorIndexList, factorList=factorList, & routehandle=rh, srcTermProcessing=srcTermProcessing, rc=rc) !EOC if (ESMF_LogFoundError(rcToCheck=rc, msg=ESMF_LOGERR_PASSTHRU, & line=__LINE__, & file=__FILE__)) & call ESMF_Finalize(endflag=ESMF_END_ABORT) !BOC call ESMF_ArraySMM(srcArray, dstArray, routehandle=rh, & termorderflag=ESMF_TERMORDER_SRCSEQ, rc=rc) !EOC if (ESMF_LogFoundError(rcToCheck=rc, msg=ESMF_LOGERR_PASSTHRU, & line=__LINE__, & file=__FILE__)) & call ESMF_Finalize(endflag=ESMF_END_ABORT) !BOC if (localPet == 0) then print *, "result SRCSEQ#1 = ", farrayPtr(1), " expect: ", sumA if (farrayPtr(1) /= sumA) & finalrc = ESMF_FAILURE endif !EOC call ESMF_RouteHandlePrint(rh, rc=rc) if (ESMF_LogFoundError(rcToCheck=rc, msg=ESMF_LOGERR_PASSTHRU, & line=__LINE__, & file=__FILE__)) & call ESMF_Finalize(endflag=ESMF_END_ABORT) call ESMF_ArraySMMRelease(rh, rc=rc) if (ESMF_LogFoundError(rcToCheck=rc, msg=ESMF_LOGERR_PASSTHRU, & line=__LINE__, & file=__FILE__)) & call ESMF_Finalize(endflag=ESMF_END_ABORT) !BOE ! The order of source terms across PETs is expected to have no effect on the ! bfb reproducibility of the result for {\tt ESMF\_TERMORDER\_SRCSEQ}. To test ! this, a sparse matrix is used where the source terms originate from different ! PETs. !EOE if (localPet == 0) then ! ! Place the src terms across PETs so that SRCPET order results ! in a different order than SRCSEQ - and resulting numerical differences. ! ! -> factors into dst element 1 on PET 0 ! * sum with 3 addends (order by src seq. index): 0.5 + 0.1 + 0.1 ! * addend 1 is on PET 1 ! * addend 2 is on PET 2 ! * addend 3 is on PET 3 ! The "SRCPET" order of the three addends is given by the cyclic scheme ! src PET scheme, starting at dstPet, going to dstPet-petCount, modulo ! petCount. With that the sum order is this: ! (12,1)*s[12]Pet3 + (5,1)*s[5]Pet2 + (4,1)*s[4]Pet1 = ! 
0.1 + 0.1 + 0.5 !BOC factorIndexList(1,1) = 4 ! src seq index factorIndexList(2,1) = 1 ! dst seq index factorList(1) = 1. factorIndexList(1,2) = 5 ! src seq index factorIndexList(2,2) = 1 ! dst seq index factorList(2) = 1. factorIndexList(1,3) = 12 ! src seq index factorIndexList(2,3) = 1 ! dst seq index factorList(3) = 1. !EOC endif !BOE ! Again the {\tt srcTermProcessing} argument is kept at 0, ensuring that none ! of the source terms are grouped into partial sums. ! !EOE !BOC ! forced srcTermProcessing srcTermProcessing = 0 call ESMF_ArraySMMStore(srcArray, dstArray, & factorIndexList=factorIndexList, factorList=factorList, & routehandle=rh, srcTermProcessing=srcTermProcessing, rc=rc) !EOC if (ESMF_LogFoundError(rcToCheck=rc, msg=ESMF_LOGERR_PASSTHRU, & line=__LINE__, & file=__FILE__)) & call ESMF_Finalize(endflag=ESMF_END_ABORT) !BOC call ESMF_ArraySMM(srcArray, dstArray, routehandle=rh, & termorderflag=ESMF_TERMORDER_SRCSEQ, rc=rc) !EOC if (ESMF_LogFoundError(rcToCheck=rc, msg=ESMF_LOGERR_PASSTHRU, & line=__LINE__, & file=__FILE__)) & call ESMF_Finalize(endflag=ESMF_END_ABORT) !BOE ! Under {\tt ESMF\_TERMORDER\_SRCSEQ} it does not matter on which PET a ! source term is located, the order of source terms is strictly defined by the ! order of source sequence indices: ! ! d(1) = s(4) + s(5) + s(12) = 0.5 + 0.1 + 0.1 = sumA ! !EOE !BOC if (localPet == 0) then print *, "result SRCSEQ#2 = ", farrayPtr(1), " expect: ", sumA if (farrayPtr(1) /= sumA) & finalrc = ESMF_FAILURE endif !EOC call ESMF_ArraySMMRelease(rh, rc=rc) if (ESMF_LogFoundError(rcToCheck=rc, msg=ESMF_LOGERR_PASSTHRU, & line=__LINE__, & file=__FILE__)) & call ESMF_Finalize(endflag=ESMF_END_ABORT) !BOE ! The same sparse matrix leads to bfb differences in the result when executed ! with the {\tt ESMF\_TERMORDER\_SRCPET} option. This is demonstrated further ! down in result {\tt SRCPET\#4}. !EOE !BOE ! {\bf ESMF\_TERMORDER\_SRCPET} ! ! {\bf All source terms coming from the same PET} ! ! 
In the following examples the {\tt srcTermProcessing} argument at store-time ! is first set to 0, forcing all of the source terms to be sent to the ! destination PET unmodified. We start by going back to the initial sparse ! matrix where all of the source terms are located on the same PET. !EOE if (localPet == 0) then ! ! Place all src terms on the same PET demonstrating that the src term order ! is then defined by SRCSEQ. ! ! -> factors into dst element 1 on PET 0 ! * sum with 3 addends (order by src seq. index): 0.5 + 0.1 + 0.1 ! * addend 1 is on PET 0 ! * addend 2 is on PET 0 ! * addend 3 is on PET 0 ! All three addends are on the same PET, therefore src seq index order, ! independent of the order in which they are stored in memory on the src Pet ! Also order in which sparse matrix elements are given is irrelevant. ! (1,1)*s[1]Pet0 + (6,1)*s[6]Pet0 + (9,1)*s[9]Pet0 = ! 0.5 + 0.1 + 0.1 !BOC factorIndexList(1,1) = 1 ! src seq index factorIndexList(2,1) = 1 ! dst seq index factorList(1) = 1. factorIndexList(1,2) = 6 ! src seq index factorIndexList(2,2) = 1 ! dst seq index factorList(2) = 1. factorIndexList(1,3) = 9 ! src seq index factorIndexList(2,3) = 1 ! dst seq index factorList(3) = 1. !EOC endif !BOC ! forced srcTermProcessing srcTermProcessing=0 call ESMF_ArraySMMStore(srcArray, dstArray, & factorIndexList=factorIndexList, factorList=factorList, & routehandle=rh, srcTermProcessing=srcTermProcessing, rc=rc) !EOC if (ESMF_LogFoundError(rcToCheck=rc, msg=ESMF_LOGERR_PASSTHRU, & line=__LINE__, & file=__FILE__)) & call ESMF_Finalize(endflag=ESMF_END_ABORT) !BOE ! Then, at execution time, the {\tt ESMF\_TERMORDER\_SRCPET} option is used. !EOE !BOC call ESMF_ArraySMM(srcArray, dstArray, routehandle=rh, & termorderflag=ESMF_TERMORDER_SRCPET, rc=rc) !EOC if (ESMF_LogFoundError(rcToCheck=rc, msg=ESMF_LOGERR_PASSTHRU, & line=__LINE__, & file=__FILE__)) & call ESMF_Finalize(endflag=ESMF_END_ABORT) !BOE ! 
Here all of the source elements originate from the same PET (PET 0). This ! fact, together with the {\tt ESMF\_TERMORDER\_SRCPET} execution-time option, ! results in the following canonical term order: ! ! d(1) = s(1) + s(6) + s(9) = 0.5 + 0.1 + 0.1 = sumA ! ! This is exactly the same term order that was used above to produce the ! result stored in {\tt sumA}. !EOE !BOC if (localPet == 0) then print *, "result SRCPET#1 = ", farrayPtr(1), " expect: ", sumA if (farrayPtr(1) /= sumA) & finalrc = ESMF_FAILURE endif !EOC call ESMF_ArraySMMRelease(rh, rc=rc) if (ESMF_LogFoundError(rcToCheck=rc, msg=ESMF_LOGERR_PASSTHRU, & line=__LINE__, & file=__FILE__)) & call ESMF_Finalize(endflag=ESMF_END_ABORT) ! --------------------------------------------------------------------------- !BOE ! The sequence indices of the source terms are the only relevant aspect in ! determining the source term order. Consider, for example, the following ! sparse matrix, where again all source terms are located on the same PET ! (PET 2): !EOE if (localPet == 0) then ! ! Place all src terms on the same PET demonstrating that the src term order ! is then defined by SRCSEQ. ! ! -> factors into dst element 1 on PET 0 ! * sum with 3 addends (order by src seq. index): 0.1 + 0.1 + 0.5 ! * addend 1 is on PET 2 ! * addend 2 is on PET 2 ! * addend 3 is on PET 2 ! All three addends are on the same PET, therefore src seq index order, ! independent of the order in which they are stored in memory on the src Pet ! Also order in which sparse matrix elements are given is irrelevant. ! (5,1)*s[5]Pet2 + (7,1)*s[7]Pet2 + (11,1)*s[11]Pet2 = ! 0.1 + 0.1 + 0.5 !BOC factorIndexList(1,1) = 11 ! src seq index factorIndexList(2,1) = 1 ! dst seq index factorList(1) = 1. factorIndexList(1,2) = 5 ! src seq index factorIndexList(2,2) = 1 ! dst seq index factorList(2) = 1. factorIndexList(1,3) = 7 ! src seq index factorIndexList(2,3) = 1 ! dst seq index factorList(3) = 1. !EOC endif !BOE ! 
This time the source term order in memory is not the same ! as their sequence index order. Specifically, the sequence indices of the ! source terms, in the order they are stored in memory, are 11, 7, 5 (see the ! source Array diagram above for reference). ! Further, as mentioned already, the order of entries in the sparse matrix ! also has no bearing on the term order in the SMM sums. ! Then, for the {\tt ESMF\_TERMORDER\_SRCPET} option, and because all source ! terms are located on the same PET, the resulting source term order is the ! canonical one determined by the source term sequence indices alone: ! ! d(1) = s(5) + s(7) + s(11) ! ! Filling in the source element data, we find ! ! d(1) = 0.1 + 0.1 + 0.5, ! ! which is expected to be bfb equivalent to the result stored in {\tt sumB} ! from above. !EOE !BOC ! forced srcTermProcessing srcTermProcessing=0 call ESMF_ArraySMMStore(srcArray, dstArray, & factorIndexList=factorIndexList, factorList=factorList, & routehandle=rh, srcTermProcessing=srcTermProcessing, rc=rc) !EOC if (ESMF_LogFoundError(rcToCheck=rc, msg=ESMF_LOGERR_PASSTHRU, & line=__LINE__, & file=__FILE__)) & call ESMF_Finalize(endflag=ESMF_END_ABORT) !BOC call ESMF_ArraySMM(srcArray, dstArray, routehandle=rh, & termorderflag=ESMF_TERMORDER_SRCPET, rc=rc) !EOC if (ESMF_LogFoundError(rcToCheck=rc, msg=ESMF_LOGERR_PASSTHRU, & line=__LINE__, & file=__FILE__)) & call ESMF_Finalize(endflag=ESMF_END_ABORT) !BOC if (localPet == 0) then print *, "result SRCPET#2 = ", farrayPtr(1), " expect: ", sumB if (farrayPtr(1) /= sumB) & finalrc = ESMF_FAILURE endif !EOC call ESMF_ArraySMMRelease(rh, rc=rc) if (ESMF_LogFoundError(rcToCheck=rc, msg=ESMF_LOGERR_PASSTHRU, & line=__LINE__, & file=__FILE__)) & call ESMF_Finalize(endflag=ESMF_END_ABORT) ! --------------------------------------------------------------------------- !BOE ! {\bf Source terms coming from different PETs} ! ! When the source terms are distributed across multiple PETs, the !
{\tt ESMF\_TERMORDER\_SRCPET} option first bundles the terms according to ! the PET on which they are stored. These source term "bundles" are then ! arranged in an order that depends on the source PET position relative to the ! destination PET: starting with the bundle for which the source PET is the ! same as the destination PET, the source term bundles are placed in descending ! order with respect to their source PET, modulo petCount. The terms within ! each source term bundle are further sorted in the canonical order according ! to their sequence index. ! ! The following sparse matrix demonstrates the effect of the ! {\tt ESMF\_TERMORDER\_SRCPET} option. !EOE if (localPet == 0) then ! ! Placing src terms on PETs so that SRCSEQ and SRCPET order are identical. ! ! -> factors into dst element 1 on PET 0 ! * sum with 3 addends (order by src seq. index): 0.5 + 0.1 + 0.1 ! * addend 1 is on PET 0 ! * addend 2 is on PET 1 ! * addend 3 is on PET 2 ! The "SRCPET" order of the three addends is given by the cyclic scheme ! src PET scheme, starting at dstPet, going to dstPet-petCount, modulo ! petCount. With that the sum order is this: ! (1,1)*s[1]Pet0 + (7,1)*s[7]Pet2 + (3,1)*s[3]Pet1 = ! 0.5 + 0.1 + 0.1 !BOC factorIndexList(1,1) = 1 ! src seq index factorIndexList(2,1) = 1 ! dst seq index factorList(1) = 1. factorIndexList(1,2) = 3 ! src seq index factorIndexList(2,2) = 1 ! dst seq index factorList(2) = 1. factorIndexList(1,3) = 7 ! src seq index factorIndexList(2,3) = 1 ! dst seq index factorList(3) = 1. !EOC endif !BOE ! Here the source terms are located on PETs 0, 1, and 2. Using a [] notation to ! indicate the source PET of each term, the term order under ! {\tt ESMF\_TERMORDER\_SRCPET} is given by: ! ! d(1) = s(1)[0] + s(7)[2] + s(3)[1] = 0.5 + 0.1 + 0.1 ! ! This is again the same order of terms that was used to produce the result ! stored in {\tt sumA} above. !EOE !BOC !
forced srcTermProcessing srcTermProcessing=0 call ESMF_ArraySMMStore(srcArray, dstArray, & factorIndexList=factorIndexList, factorList=factorList, & routehandle=rh, srcTermProcessing=srcTermProcessing, rc=rc) !EOC if (ESMF_LogFoundError(rcToCheck=rc, msg=ESMF_LOGERR_PASSTHRU, & line=__LINE__, & file=__FILE__)) & call ESMF_Finalize(endflag=ESMF_END_ABORT) !BOC call ESMF_ArraySMM(srcArray, dstArray, routehandle=rh, & termorderflag=ESMF_TERMORDER_SRCPET, rc=rc) !EOC if (ESMF_LogFoundError(rcToCheck=rc, msg=ESMF_LOGERR_PASSTHRU, & line=__LINE__, & file=__FILE__)) & call ESMF_Finalize(endflag=ESMF_END_ABORT) !BOC if (localPet == 0) then print *, "result SRCPET#3 = ", farrayPtr(1), " expect: ", sumA if (farrayPtr(1) /= sumA) & finalrc = ESMF_FAILURE endif !EOC call ESMF_ArraySMMRelease(rh, rc=rc) if (ESMF_LogFoundError(rcToCheck=rc, msg=ESMF_LOGERR_PASSTHRU, & line=__LINE__, & file=__FILE__)) & call ESMF_Finalize(endflag=ESMF_END_ABORT) ! --------------------------------------------------------------------------- !BOE ! In the above example, the fact that the terms were ordered by source PET ! first, did not lead to numerical bfb differences compared to the canonical ! source term order. However, this was purely coincidental in the way the ! numbers worked out for this example. The following case looks at a situation ! where the source PET order {\em does} lead to a result that shows bfb ! differences compared to the canonical term order. !EOE if (localPet == 0) then ! ! This time place the src terms across PETs so that SRCPET order results ! in a different order than SRCSEQ - and resulting numerical differences. ! ! -> factors into dst element 1 on PET 0 ! * sum with 3 addends (order by src seq. index): 0.5 + 0.1 + 0.1 ! * addend 1 is on PET 1 ! * addend 2 is on PET 2 ! * addend 3 is on PET 3 ! The "SRCPET" order of the three addends is given by the cyclic scheme ! src PET scheme, starting at dstPet, going to dstPet-petCount, modulo ! petCount. 
With that the sum order is this: ! (12,1)*s[12]Pet3 + (5,1)*s[5]Pet2 + (4,1)*s[4]Pet1 = ! 0.1 + 0.1 + 0.5 !BOC factorIndexList(1,1) = 4 ! src seq index factorIndexList(2,1) = 1 ! dst seq index factorList(1) = 1. factorIndexList(1,2) = 5 ! src seq index factorIndexList(2,2) = 1 ! dst seq index factorList(2) = 1. factorIndexList(1,3) = 12 ! src seq index factorIndexList(2,3) = 1 ! dst seq index factorList(3) = 1. !EOC endif !BOE ! The canonical source term order of this SMM sum, determined by the source ! sequence indices alone, is: ! ! d(1) = s(4) + s(5) + s(12) = 0.5 + 0.1 + 0.1, ! ! which again would lead to a result that is bfb identical to {\tt sumA}. ! However, this is not the term order resulting from the ! {\tt ESMF\_TERMORDER\_SRCPET} option. The actual order for this option is: ! ! d(1) = s(12)[3] + s(5)[2] + s(4)[1] = 0.1 + 0.1 + 0.5, ! ! resulting in a sum that is bfb identical to {\tt sumB} instead. !EOE !BOC ! forced srcTermProcessing srcTermProcessing=0 call ESMF_ArraySMMStore(srcArray, dstArray, & factorIndexList=factorIndexList, factorList=factorList, & routehandle=rh, srcTermProcessing=srcTermProcessing, rc=rc) !EOC if (ESMF_LogFoundError(rcToCheck=rc, msg=ESMF_LOGERR_PASSTHRU, & line=__LINE__, & file=__FILE__)) & call ESMF_Finalize(endflag=ESMF_END_ABORT) !BOC call ESMF_ArraySMM(srcArray, dstArray, routehandle=rh, & termorderflag=ESMF_TERMORDER_SRCPET, rc=rc) !EOC if (ESMF_LogFoundError(rcToCheck=rc, msg=ESMF_LOGERR_PASSTHRU, & line=__LINE__, & file=__FILE__)) & call ESMF_Finalize(endflag=ESMF_END_ABORT) !BOC if (localPet == 0) then print *, "result SRCPET#4 = ", farrayPtr(1), " expect: ", sumB if (farrayPtr(1) /= sumB) & finalrc = ESMF_FAILURE endif !EOC call ESMF_ArraySMMRelease(rh, rc=rc) if (ESMF_LogFoundError(rcToCheck=rc, msg=ESMF_LOGERR_PASSTHRU, & line=__LINE__, & file=__FILE__)) & call ESMF_Finalize(endflag=ESMF_END_ABORT) ! --------------------------------------------------------------------------- !BOE ! 
{\bf Grouping of source terms coming from the same PET} ! ! So far the {\tt srcTermProcessing} argument was kept at 0, and therefore ! source term grouping did not have to be considered. Source term grouping is only ! possible for terms that originate from the same PET. In preparation ! for a closer look at the bfb effects of source term grouping, consider a ! sparse matrix where two of the source terms are located on the same PET. !EOE ! start looking into srcTermProcessing if (localPet == 0) then ! ! Place the last two terms on the same PET, but still have srcTermProcessing ! at 0, therefore not different from having them on different PETs. ! ! -> factors into dst element 1 on PET 0 ! * sum with 3 addends (order by src seq. index): 0.5 + 0.1 + 0.1 ! * addend 1 is on PET 0 ! * addend 2 is on PET 2 ! * addend 3 is on PET 2 ! The "SRCPET" order of the three addends is given by the cyclic scheme ! src PET scheme, starting at dstPet, going to dstPet-petCount, modulo ! petCount. With that the sum order is this: ! (1,1)*s[1]Pet0 + (5,1)*s[5]Pet2 + (7,1)*s[7]Pet2 = ! 0.5 + 0.1 + 0.1 !BOC factorIndexList(1,1) = 1 ! src seq index factorIndexList(2,1) = 1 ! dst seq index factorList(1) = 1. factorIndexList(1,2) = 5 ! src seq index factorIndexList(2,2) = 1 ! dst seq index factorList(2) = 1. factorIndexList(1,3) = 7 ! src seq index factorIndexList(2,3) = 1 ! dst seq index factorList(3) = 1. !EOC endif !BOE ! Here one of the source terms is located on PET 0 while the other two ! source terms originate on PET 2. Keeping the {\tt srcTermProcessing} ! argument at 0 first, the term order under {\tt ESMF\_TERMORDER\_SRCPET} is ! given by: ! ! d(1) = s(1)[0] + s(5)[2] + s(7)[2] = 0.5 + 0.1 + 0.1 ! ! And again the result is expected to be bfb identical to the number stored ! in {\tt sumA}. !EOE !BOC !
forced srcTermProcessing srcTermProcessing=0 call ESMF_ArraySMMStore(srcArray, dstArray, & factorIndexList=factorIndexList, factorList=factorList, & routehandle=rh, srcTermProcessing=srcTermProcessing, rc=rc) !EOC if (ESMF_LogFoundError(rcToCheck=rc, msg=ESMF_LOGERR_PASSTHRU, & line=__LINE__, & file=__FILE__)) & call ESMF_Finalize(endflag=ESMF_END_ABORT) !BOC call ESMF_ArraySMM(srcArray, dstArray, routehandle=rh, & termorderflag=ESMF_TERMORDER_SRCPET, rc=rc) !EOC if (ESMF_LogFoundError(rcToCheck=rc, msg=ESMF_LOGERR_PASSTHRU, & line=__LINE__, & file=__FILE__)) & call ESMF_Finalize(endflag=ESMF_END_ABORT) !BOC if (localPet == 0) then print *, "result SRCPET#5 = ", farrayPtr(1), " expect: ", sumA if (farrayPtr(1) /= sumA) & finalrc = ESMF_FAILURE endif !EOC call ESMF_ArraySMMRelease(rh, rc=rc) if (ESMF_LogFoundError(rcToCheck=rc, msg=ESMF_LOGERR_PASSTHRU, & line=__LINE__, & file=__FILE__)) & call ESMF_Finalize(endflag=ESMF_END_ABORT) ! --------------------------------------------------------------------------- !BOE ! The same result is also expected with {\tt srcTermProcessing} set to 1. A ! value of 1 indicates that the multiplication of the source term with its ! sparse matrix factor is carried out on the source side before being sent to ! the destination PET. The final sum is still carried out in the same order on ! the destination PET, essentially resulting in the exact same bfb identical ! sum as for {\tt srcTermProcessing} set to 0. !EOE !BOC ! 
forced srcTermProcessing srcTermProcessing=1 call ESMF_ArraySMMStore(srcArray, dstArray, & factorIndexList=factorIndexList, factorList=factorList, & routehandle=rh, srcTermProcessing=srcTermProcessing, rc=rc) !EOC if (ESMF_LogFoundError(rcToCheck=rc, msg=ESMF_LOGERR_PASSTHRU, & line=__LINE__, & file=__FILE__)) & call ESMF_Finalize(endflag=ESMF_END_ABORT) !BOC call ESMF_ArraySMM(srcArray, dstArray, routehandle=rh, & termorderflag=ESMF_TERMORDER_SRCPET, rc=rc) !EOC if (ESMF_LogFoundError(rcToCheck=rc, msg=ESMF_LOGERR_PASSTHRU, & line=__LINE__, & file=__FILE__)) & call ESMF_Finalize(endflag=ESMF_END_ABORT) !BOC if (localPet == 0) then print *, "result SRCPET#6 = ", farrayPtr(1), " expect: ", sumA if (farrayPtr(1) /= sumA) & finalrc = ESMF_FAILURE endif !EOC call ESMF_ArraySMMRelease(rh, rc=rc) if (ESMF_LogFoundError(rcToCheck=rc, msg=ESMF_LOGERR_PASSTHRU, & line=__LINE__, & file=__FILE__)) & call ESMF_Finalize(endflag=ESMF_END_ABORT) ! --------------------------------------------------------------------------- !BOE ! Increasing the {\tt srcTermProcessing} argument to 2 (or higher) results in ! source term grouping of the terms (up to the number specified in ! {\tt srcTermProcessing}) that are on the same source PET. ! ! d(1) = s(1)[0] + ( s(5)[2] + s(7)[2] ) = 0.5 + (0.1 + 0.1) ! ! This result is bfb identical to first adding 0.1 and 0.1 into a partial sum, ! and then adding this sum to 0.5. This is the exact grouping of ! terms that was used to obtain the result stored in {\tt sumB} from above. !EOE !BOC ! 
forced srcTermProcessing srcTermProcessing=2 call ESMF_ArraySMMStore(srcArray, dstArray, & factorIndexList=factorIndexList, factorList=factorList, & routehandle=rh, srcTermProcessing=srcTermProcessing, rc=rc) !EOC if (ESMF_LogFoundError(rcToCheck=rc, msg=ESMF_LOGERR_PASSTHRU, & line=__LINE__, & file=__FILE__)) & call ESMF_Finalize(endflag=ESMF_END_ABORT) !BOC call ESMF_ArraySMM(srcArray, dstArray, routehandle=rh, & termorderflag=ESMF_TERMORDER_SRCPET, rc=rc) !EOC if (ESMF_LogFoundError(rcToCheck=rc, msg=ESMF_LOGERR_PASSTHRU, & line=__LINE__, & file=__FILE__)) & call ESMF_Finalize(endflag=ESMF_END_ABORT) !BOC if (localPet == 0) then print *, "result SRCPET#7 = ", farrayPtr(1), " expect: ", sumB if (farrayPtr(1) /= sumB) & finalrc = ESMF_FAILURE endif !EOC call ESMF_ArraySMMRelease(rh, rc=rc) if (ESMF_LogFoundError(rcToCheck=rc, msg=ESMF_LOGERR_PASSTHRU, & line=__LINE__, & file=__FILE__)) & call ESMF_Finalize(endflag=ESMF_END_ABORT) ! --------------------------------------------------------------------------- ! the following examples will have 4 src terms deallocate(factorIndexList, factorList) smmElementCount = 0 if (localPet == 0) then smmElementCount = 4 endif allocate(factorIndexList(2,smmElementCount), factorList(smmElementCount)) ! --------------------------------------------------------------------------- !BOE ! In order to explore the effects of the {\tt srcTermProcessing} argument ! further, more terms on the same source PET are needed in the SMM sum. ! The following sparse matrix has four entries, three of which originate from ! the same PET (PET 3). !EOE if (localPet == 0) then ! ! First with srcTermProcessing set to 0 ! ! -> factors into dst element 1 on PET 0 ! * sum with 4 addends (order by src seq. index): 0.5 + 0.2 + 0.1 + 0.1 ! * addend 1 is on PET 0 ! * addend 2 is on PET 3 ! * addend 3 is on PET 3 ! * addend 4 is on PET 3 ! The "SRCPET" order of the three addends is given by the cyclic scheme ! 
src PET scheme, starting at dstPet, going to dstPet-petCount, modulo ! petCount. With that the sum order is this: ! (1,1)*s[1]Pet0 + (2,1)*s[2]Pet3 + (8,1)*s[8]Pet3 + (12,1)*s[12]Pet3 = ! 0.5 + 0.2 + 0.1 + 0.1 !BOC factorIndexList(1,1) = 1 ! src seq index factorIndexList(2,1) = 1 ! dst seq index factorList(1) = 1. factorIndexList(1,2) = 2 ! src seq index factorIndexList(2,2) = 1 ! dst seq index factorList(2) = 1. factorIndexList(1,3) = 8 ! src seq index factorIndexList(2,3) = 1 ! dst seq index factorList(3) = 1. factorIndexList(1,4) = 12 ! src seq index factorIndexList(2,4) = 1 ! dst seq index factorList(4) = 1. !EOC endif !BOE ! Setting the {\tt srcTermProcessing} argument back to 0 puts the terms in ! PET order, and canonical order for each PET bundle. ! ! d(1) = s(1)[0] + s(2)[3] + s(8)[3] + s(12)[3] = 0.5 + 0.2 + 0.1 + 0.1 ! ! The bfb identical result for this sum was calculated and stored in variable ! {\tt sumC} above. !EOE !BOC ! forced srcTermProcessing srcTermProcessing=0 call ESMF_ArraySMMStore(srcArray, dstArray, & factorIndexList=factorIndexList, factorList=factorList, & routehandle=rh, srcTermProcessing=srcTermProcessing, rc=rc) !EOC if (ESMF_LogFoundError(rcToCheck=rc, msg=ESMF_LOGERR_PASSTHRU, & line=__LINE__, & file=__FILE__)) & call ESMF_Finalize(endflag=ESMF_END_ABORT) !BOC call ESMF_ArraySMM(srcArray, dstArray, routehandle=rh, & termorderflag=ESMF_TERMORDER_SRCPET, rc=rc) !EOC if (ESMF_LogFoundError(rcToCheck=rc, msg=ESMF_LOGERR_PASSTHRU, & line=__LINE__, & file=__FILE__)) & call ESMF_Finalize(endflag=ESMF_END_ABORT) !BOC if (localPet == 0) then print *, "result SRCPET#8 = ", farrayPtr(1), " expect: ", sumC if (farrayPtr(1) /= sumC) & finalrc = ESMF_FAILURE endif !EOC call ESMF_ArraySMMRelease(rh, rc=rc) if (ESMF_LogFoundError(rcToCheck=rc, msg=ESMF_LOGERR_PASSTHRU, & line=__LINE__, & file=__FILE__)) & call ESMF_Finalize(endflag=ESMF_END_ABORT) ! --------------------------------------------------------------------------- !BOE ! 
Setting the {\tt srcTermProcessing} argument to a value of 2 results in the ! following source term grouping: ! ! d(1) = s(1)[0] + ( s(2)[3] + s(8)[3] ) + s(12)[3] ! = 0.5 + ( 0.2 + 0.1 ) + 0.1, ! ! where the (0.2 + 0.1) partial sum is carried out on source PET 3, and ! then sent to the destination PET (PET 0), together with the unmodified data ! from source element 12 (0.1). The final sum is performed on PET 0. The ! result is identical to the precomputed value stored in {\tt sumD}. The ! numbers work out in a way where this result is bfb identical to the ! previous result, i.e. {\tt sumC}. However, this bfb match is purely ! coincidental. !EOE ! ! Next with srcTermProcessing set to 2, changing the term order, but no ! bfb difference here. ! ! -> factors into dst element 1 on PET 0 ! * sum with 4 addends (order by src seq. index): 0.5 + 0.2 + 0.1 + 0.1 ! * addend 1 is on PET 0 ! * addend 2 is on PET 3 ! * addend 3 is on PET 3 ! * addend 4 is on PET 3 ! The "SRCPET" order of the four addends is given by the cyclic ! src PET scheme, starting at dstPet, going to dstPet-petCount, modulo ! petCount. With that the sum order is this: ! (1,1)*s[1]Pet0 + ( (2,1)*s[2]Pet3 + (8,1)*s[8]Pet3 ) + (12,1)*s[12]Pet3 = ! 0.5 + ( 0.2 + 0.1 ) + 0.1 !BOC !
forced srcTermProcessing srcTermProcessing=2 call ESMF_ArraySMMStore(srcArray, dstArray, & factorIndexList=factorIndexList, factorList=factorList, & routehandle=rh, srcTermProcessing=srcTermProcessing, rc=rc) !EOC if (ESMF_LogFoundError(rcToCheck=rc, msg=ESMF_LOGERR_PASSTHRU, & line=__LINE__, & file=__FILE__)) & call ESMF_Finalize(endflag=ESMF_END_ABORT) !BOC call ESMF_ArraySMM(srcArray, dstArray, routehandle=rh, & termorderflag=ESMF_TERMORDER_SRCPET, rc=rc) !EOC if (ESMF_LogFoundError(rcToCheck=rc, msg=ESMF_LOGERR_PASSTHRU, & line=__LINE__, & file=__FILE__)) & call ESMF_Finalize(endflag=ESMF_END_ABORT) !BOC if (localPet == 0) then print *, "result SRCPET#9 = ", farrayPtr(1), " expect: ", sumD if (farrayPtr(1) /= sumD) & finalrc = ESMF_FAILURE endif !EOC call ESMF_ArraySMMRelease(rh, rc=rc) if (ESMF_LogFoundError(rcToCheck=rc, msg=ESMF_LOGERR_PASSTHRU, & line=__LINE__, & file=__FILE__)) & call ESMF_Finalize(endflag=ESMF_END_ABORT) ! --------------------------------------------------------------------------- !BOE ! Increasing the {\tt srcTermProcessing} argument up to 3 results in a three ! term partial sum on PET 3: ! ! d(1) = s(1)[0] + ( s(2)[3] + s(8)[3] + s(12)[3] ) ! = 0.5 + ( 0.2 + 0.1 + 0.1 ). ! ! Again the final sum is performed on PET 0. The result is bfb identical to ! the number stored in {\tt sumE}, which, for the chosen numbers, works out to ! have a bfb difference compared to {\tt sumC} and {\tt sumD}. !EOE ! ! Next with srcTermProcessing set to 3, changing the term order, this time ! with bfb effect. ! ! -> factors into dst element 1 on PET 0 ! * sum with 4 addends (order by src seq. index): 0.5 + 0.2 + 0.1 + 0.1 ! * addend 1 is on PET 0 ! * addend 2 is on PET 3 ! * addend 3 is on PET 3 ! * addend 4 is on PET 3 ! The "SRCPET" order of the three addends is given by the cyclic scheme ! src PET scheme, starting at dstPet, going to dstPet-petCount, modulo ! petCount. With that the sum order is this: ! 
(1,1)*s[1]Pet0 + ( (2,1)*s[2]Pet3 + (8,1)*s[8]Pet3 + (12,1)*s[12]Pet3 ) = ! 0.5 + ( 0.2 + 0.1 + 0.1 ) !BOC ! forced srcTermProcessing srcTermProcessing=3 call ESMF_ArraySMMStore(srcArray, dstArray, & factorIndexList=factorIndexList, factorList=factorList, & routehandle=rh, srcTermProcessing=srcTermProcessing, rc=rc) !EOC if (ESMF_LogFoundError(rcToCheck=rc, msg=ESMF_LOGERR_PASSTHRU, & line=__LINE__, & file=__FILE__)) & call ESMF_Finalize(endflag=ESMF_END_ABORT) !BOC call ESMF_ArraySMM(srcArray, dstArray, routehandle=rh, & termorderflag=ESMF_TERMORDER_SRCPET, rc=rc) !EOC if (ESMF_LogFoundError(rcToCheck=rc, msg=ESMF_LOGERR_PASSTHRU, & line=__LINE__, & file=__FILE__)) & call ESMF_Finalize(endflag=ESMF_END_ABORT) !BOC if (localPet == 0) then print *, "result SRCPET#10 = ", farrayPtr(1), " expect: ", sumE if (farrayPtr(1) /= sumE) & finalrc = ESMF_FAILURE endif !EOC call ESMF_ArraySMMRelease(rh, rc=rc) if (ESMF_LogFoundError(rcToCheck=rc, msg=ESMF_LOGERR_PASSTHRU, & line=__LINE__, & file=__FILE__)) & call ESMF_Finalize(endflag=ESMF_END_ABORT) ! --------------------------------------------------------------------------- !BOE ! {\bf Reproducibility and Performance} ! ! The above examples show how bit-for-bit (bfb) reproducibility is a result of ! controlling the term order. ESMF offers several options to control the term ! order in the sparse matrix multiplication (SMM) implementation: ! \begin{sloppypar} ! \begin{itemize} ! \item To guarantee bfb reproducibility between consecutive executions of the ! same RouteHandle object, the {\tt ESMF\_TERMORDER\_SRCPET} execution-time ! option suffices. ! \item If bfb reproducibility is required between {\em different} RouteHandles, ! e.g. a RouteHandle that is precomputed each time the application starts, ! then it must be further ensured that the same value of {\tt srcTermProcessing} ! is specified during the store call. Under these conditions the ESMF SMM ! 
implementation guarantees bfb identical results between runs, as long as the ! number of PETs does not change. ! \item To guarantee bfb reproducibility between different runs, even when the ! number of PETs, and therefore the data distribution changes, the execution ! option {\tt ESMF\_TERMORDER\_SRCSEQ} must be chosen together with ! {\tt srcTermProcessing} equal to 0 or 1 (in order to prevent partial sums). ! \end{itemize} ! \end{sloppypar} ! ! The term order in a SMM operation does not only affect the bfb ! reproducibility of the result, but also affects the SMM {\em performance}. ! The precise performance implications of a specific term order are ! complicated and strongly depend on the exact problem structure, as well as ! on the details of the compute hardware. ESMF implements an auto-tuning ! mechanism that can be used to conveniently determine a close to optimal set ! of SMM performance parameters. ! ! There are two SMM performance parameters in ESMF that are encoded into a ! RouteHandle during store-time: {\tt srcTermProcessing} and ! {\tt pipelineDepth}. The first one affects the term order in the SMM sums and ! has been discussed in detail above. The second parameter, {\tt pipelineDepth}, ! determines how many in- and out-bound messages may be outstanding on each ! PET. It has no effect on the term order and does not lead to bfb differences ! in the SMM results. However, in order to achieve good performance ! reproducibility, the user has the option to pass in a fixed value of the ! {\tt pipelineDepth} argument when precomputing RouteHandles. ! ! Store calls that take the {\tt srcTermProcessing} and/or {\tt pipelineDepth} ! argument specify them as {\tt optional} with {\tt intent(inout)}. Omitting the ! argument when calling, or passing a variable that is set to a negative ! number, indicates that the respective parameter needs to be determined by ! the library. Further, if a variable with a negative value was passed in, then ! 
the variable is overwritten and replaced by the auto-tuned value on return. Through ! this mechanism a user can leverage the built-in auto-tuning feature of ESMF to ! obtain the best possible performance for a specific problem on a particular ! compute hardware, while still ensuring bfb and performance ! reproducibility between runs. The following example shows code that first ! checks if previously stored SMM performance parameters are available in a ! file on disk, and then either reads and uses them, or else uses auto-tuning ! to determine the parameters before writing them to file. For simplicity the ! same sparse matrix as in the previous example is used. ! !EOE ! repeat the following code a few times to test bfb between SMM with fixed ! parameters do i=1,5 !BOC ! precondition the arguments for auto-tuning and overwriting srcTermProcessing = -1 ! init negative value pipelineDepth = -1 ! init negative value ! get a free Fortran i/o unit call ESMF_UtilIOUnitGet(unit=iounit, rc=rc) !EOC if (ESMF_LogFoundError(rcToCheck=rc, msg=ESMF_LOGERR_PASSTHRU, & line=__LINE__, & file=__FILE__)) & call ESMF_Finalize(endflag=ESMF_END_ABORT) !BOC ! try to open the file that holds the SMM parameters open(unit=iounit, file="smmParameters.dat", status="old", action="read", & form="unformatted", iostat=iostat) if (iostat == 0) then ! the file was present -> read from it and close it again read(unit=iounit, iostat=iostat) srcTermProcessing, pipelineDepth, & sumCompare close(unit=iounit) endif if ((localPet == 0) .and. 
(iostat == 0)) then print *, "SMM parameters successfully read from file" print *, " srcTermProcessing=", srcTermProcessing, " pipelineDepth=", & pipelineDepth, " ==>> sumCompare=", sumCompare endif call ESMF_ArraySMMStore(srcArray, dstArray, & factorIndexList=factorIndexList, factorList=factorList, & routehandle=rh, srcTermProcessing=srcTermProcessing, & pipelineDepth=pipelineDepth, rc=rc) !EOC if (ESMF_LogFoundError(rcToCheck=rc, msg=ESMF_LOGERR_PASSTHRU, & line=__LINE__, & file=__FILE__)) & call ESMF_Finalize(endflag=ESMF_END_ABORT) !BOC call ESMF_ArraySMM(srcArray, dstArray, routehandle=rh, & termorderflag=ESMF_TERMORDER_SRCPET, rc=rc) !EOC if (ESMF_LogFoundError(rcToCheck=rc, msg=ESMF_LOGERR_PASSTHRU, & line=__LINE__, & file=__FILE__)) & call ESMF_Finalize(endflag=ESMF_END_ABORT) !BOC if ((localPet == 0) .and. (iostat /= 0)) then print *, "SMM parameters determined via auto-tuning -> dump to file" open(unit=iounit, file="smmParameters.dat", status="unknown", & action="write", form="unformatted") write(unit=iounit) srcTermProcessing, pipelineDepth, farrayPtr(1) close(unit=iounit) endif if (localPet == 0) then if (iostat /= 0) then ! cannot do bfb comparison of the result without reference print *, "result SRCPET#11 = ", farrayPtr(1) else ! do bfb comparison of the result against reference print *, "result SRCPET#11 = ", farrayPtr(1), " expect: ", sumCompare if (farrayPtr(1) /= sumCompare) then finalrc = ESMF_FAILURE write (msg, *) "Numerical difference detected: ", & farrayPtr(1)-sumCompare call ESMF_LogWrite(msg, ESMF_LOGMSG_INFO) endif endif endif !EOC ! barrier ensures that file is written before any PET tries to read call ESMF_VMBarrier(vm, rc=rc) if (ESMF_LogFoundError(rcToCheck=rc, msg=ESMF_LOGERR_PASSTHRU, & line=__LINE__, & file=__FILE__)) & call ESMF_Finalize(endflag=ESMF_END_ABORT) ! 
release RH before the next pre-compute call ESMF_ArraySMMRelease(rh, rc=rc) if (ESMF_LogFoundError(rcToCheck=rc, msg=ESMF_LOGERR_PASSTHRU, & line=__LINE__, & file=__FILE__)) & call ESMF_Finalize(endflag=ESMF_END_ABORT) enddo !BOE ! Running this example for the first time exercises the auto-tuning branch. The ! auto-tuned {\tt srcTermProcessing} and {\tt pipelineDepth} parameters are ! then used in the SMM execution, as well as written to file. The SMM result ! variable is also written to the same file for test purposes. ! Any subsequent execution of the same example branches into the code that ! reads the previously determined SMM execution parameters from file, re-using ! them during store-time. This ensures bfb reproducibility of the SMM result, ! which is tested in this example by comparing to the previously stored value. !EOE ! --------------------------------------------------------------------------- !--------- extra test for ESMF_TERMORDER_SRCSEQ --------- srcTermProcessing = 2 call ESMF_ArraySMMStore(srcArray, dstArray, & factorIndexList=factorIndexList, factorList=factorList, & routehandle=rh, srcTermProcessing=srcTermProcessing, rc=rc) if (ESMF_LogFoundError(rcToCheck=rc, msg=ESMF_LOGERR_PASSTHRU, & line=__LINE__, & file=__FILE__)) & call ESMF_Finalize(endflag=ESMF_END_ABORT) call ESMF_ArraySMM(srcArray, dstArray, routehandle=rh, & termorderflag=ESMF_TERMORDER_SRCSEQ, rc=rc) if (ESMF_LogFoundError(rcToCheck=rc, msg=ESMF_LOGERR_PASSTHRU, & line=__LINE__, & file=__FILE__)) & call ESMF_Finalize(endflag=ESMF_END_ABORT) if (localPet == 0) then print *, "result SRCSEQ#3 = ", farrayPtr(1), " expect: ", sumC if (farrayPtr(1) /= sumC) & finalrc = ESMF_FAILURE endif call ESMF_ArraySMMRelease(rh, rc=rc) if (ESMF_LogFoundError(rcToCheck=rc, msg=ESMF_LOGERR_PASSTHRU, & line=__LINE__, & file=__FILE__)) & call ESMF_Finalize(endflag=ESMF_END_ABORT) !--------- end extra stuff for now --------- deallocate(factorIndexList, factorList) 10 continue ! 
IMPORTANT: ESMF_STest() prints the PASS string and the # of processors in the log ! file that the scripts grep for. call ESMF_STest((finalrc.eq.ESMF_SUCCESS), testname, failMsg, result, ESMF_SRCLINE) call ESMF_Finalize(rc=rc) if (rc/=ESMF_SUCCESS) finalrc = ESMF_FAILURE if (finalrc==ESMF_SUCCESS) then print *, "PASS: ESMF_RHandleBitForBitEx.F90" else print *, "FAIL: ESMF_RHandleBitForBitEx.F90" endif end program