/* Copyright (C) 2000-2024 Free Software Foundation, Inc. Contributed by James E. Wilson . This file is part of GCC. GCC is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 3, or (at your option) any later version. GCC is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. Under Section 7 of GPL version 3, you are granted additional permissions described in the GCC Runtime Library Exception, version 3.1, as published by the Free Software Foundation. You should have received a copy of the GNU General Public License and a copy of the GCC Runtime Library Exception along with this program; see the files COPYING3 and COPYING.RUNTIME respectively. If not, see . */ #ifdef L__divxf3 // Compute a 80-bit IEEE double-extended quotient. // // From the Intel IA-64 Optimization Guide, choose the minimum latency // alternative. // // farg0 holds the dividend. farg1 holds the divisor. // // __divtf3 is an alternate symbol name for backward compatibility. .text .align 16 .global __divxf3 .proc __divxf3 __divxf3: #ifdef SHARED .global __divtf3 __divtf3: #endif cmp.eq p7, p0 = r0, r0 frcpa.s0 f10, p6 = farg0, farg1 ;; (p6) cmp.ne p7, p0 = r0, r0 .pred.rel.mutex p6, p7 (p6) fnma.s1 f11 = farg1, f10, f1 (p6) fma.s1 f12 = farg0, f10, f0 ;; (p6) fma.s1 f13 = f11, f11, f0 (p6) fma.s1 f14 = f11, f11, f11 ;; (p6) fma.s1 f11 = f13, f13, f11 (p6) fma.s1 f13 = f14, f10, f10 ;; (p6) fma.s1 f10 = f13, f11, f10 (p6) fnma.s1 f11 = farg1, f12, farg0 ;; (p6) fma.s1 f11 = f11, f10, f12 (p6) fnma.s1 f12 = farg1, f10, f1 ;; (p6) fma.s1 f10 = f12, f10, f10 (p6) fnma.s1 f12 = farg1, f11, farg0 ;; (p6) fma.s0 fret0 = f12, f10, f11 (p7) mov fret0 = f10 br.ret.sptk rp .endp __divxf3 #endif #ifdef L__divdf3 // Compute a 64-bit IEEE double quotient. // // From the Intel IA-64 Optimization Guide, choose the minimum latency // alternative. // // farg0 holds the dividend. farg1 holds the divisor. .text .align 16 .global __divdf3 .proc __divdf3 __divdf3: cmp.eq p7, p0 = r0, r0 frcpa.s0 f10, p6 = farg0, farg1 ;; (p6) cmp.ne p7, p0 = r0, r0 .pred.rel.mutex p6, p7 (p6) fmpy.s1 f11 = farg0, f10 (p6) fnma.s1 f12 = farg1, f10, f1 ;; (p6) fma.s1 f11 = f12, f11, f11 (p6) fmpy.s1 f13 = f12, f12 ;; (p6) fma.s1 f10 = f12, f10, f10 (p6) fma.s1 f11 = f13, f11, f11 ;; (p6) fmpy.s1 f12 = f13, f13 (p6) fma.s1 f10 = f13, f10, f10 ;; (p6) fma.d.s1 f11 = f12, f11, f11 (p6) fma.s1 f10 = f12, f10, f10 ;; (p6) fnma.d.s1 f8 = farg1, f11, farg0 ;; (p6) fma.d fret0 = f8, f10, f11 (p7) mov fret0 = f10 br.ret.sptk rp ;; .endp __divdf3 #endif #ifdef L__divsf3 // Compute a 32-bit IEEE float quotient. // // From the Intel IA-64 Optimization Guide, choose the minimum latency // alternative. // // farg0 holds the dividend. farg1 holds the divisor. .text .align 16 .global __divsf3 .proc __divsf3 __divsf3: cmp.eq p7, p0 = r0, r0 frcpa.s0 f10, p6 = farg0, farg1 ;; (p6) cmp.ne p7, p0 = r0, r0 .pred.rel.mutex p6, p7 (p6) fmpy.s1 f8 = farg0, f10 (p6) fnma.s1 f9 = farg1, f10, f1 ;; (p6) fma.s1 f8 = f9, f8, f8 (p6) fmpy.s1 f9 = f9, f9 ;; (p6) fma.s1 f8 = f9, f8, f8 (p6) fmpy.s1 f9 = f9, f9 ;; (p6) fma.d.s1 f10 = f9, f8, f8 ;; (p6) fnorm.s.s0 fret0 = f10 (p7) mov fret0 = f10 br.ret.sptk rp ;; .endp __divsf3 #endif #ifdef L__divdi3 // Compute a 64-bit integer quotient. // // From the Intel IA-64 Optimization Guide, choose the minimum latency // alternative. // // in0 holds the dividend. in1 holds the divisor. .text .align 16 .global __divdi3 .proc __divdi3 __divdi3: .regstk 2,0,0,0 // Transfer inputs to FP registers. setf.sig f8 = in0 setf.sig f9 = in1 // Check divide by zero. cmp.ne.unc p0,p7=0,in1 ;; // Convert the inputs to FP, so that they won't be treated as unsigned. fcvt.xf f8 = f8 fcvt.xf f9 = f9 (p7) break 1 ;; // Compute the reciprocal approximation. frcpa.s1 f10, p6 = f8, f9 ;; // 3 Newton-Raphson iterations. (p6) fnma.s1 f11 = f9, f10, f1 (p6) fmpy.s1 f12 = f8, f10 ;; (p6) fmpy.s1 f13 = f11, f11 (p6) fma.s1 f12 = f11, f12, f12 ;; (p6) fma.s1 f10 = f11, f10, f10 (p6) fma.s1 f11 = f13, f12, f12 ;; (p6) fma.s1 f10 = f13, f10, f10 (p6) fnma.s1 f12 = f9, f11, f8 ;; (p6) fma.s1 f10 = f12, f10, f11 ;; // Round quotient to an integer. fcvt.fx.trunc.s1 f10 = f10 ;; // Transfer result to GP registers. getf.sig ret0 = f10 br.ret.sptk rp ;; .endp __divdi3 #endif #ifdef L__moddi3 // Compute a 64-bit integer modulus. // // From the Intel IA-64 Optimization Guide, choose the minimum latency // alternative. // // in0 holds the dividend (a). in1 holds the divisor (b). .text .align 16 .global __moddi3 .proc __moddi3 __moddi3: .regstk 2,0,0,0 // Transfer inputs to FP registers. setf.sig f14 = in0 setf.sig f9 = in1 // Check divide by zero. cmp.ne.unc p0,p7=0,in1 ;; // Convert the inputs to FP, so that they won't be treated as unsigned. fcvt.xf f8 = f14 fcvt.xf f9 = f9 (p7) break 1 ;; // Compute the reciprocal approximation. frcpa.s1 f10, p6 = f8, f9 ;; // 3 Newton-Raphson iterations. (p6) fmpy.s1 f12 = f8, f10 (p6) fnma.s1 f11 = f9, f10, f1 ;; (p6) fma.s1 f12 = f11, f12, f12 (p6) fmpy.s1 f13 = f11, f11 ;; (p6) fma.s1 f10 = f11, f10, f10 (p6) fma.s1 f11 = f13, f12, f12 ;; sub in1 = r0, in1 (p6) fma.s1 f10 = f13, f10, f10 (p6) fnma.s1 f12 = f9, f11, f8 ;; setf.sig f9 = in1 (p6) fma.s1 f10 = f12, f10, f11 ;; fcvt.fx.trunc.s1 f10 = f10 ;; // r = q * (-b) + a xma.l f10 = f10, f9, f14 ;; // Transfer result to GP registers. getf.sig ret0 = f10 br.ret.sptk rp ;; .endp __moddi3 #endif #ifdef L__udivdi3 // Compute a 64-bit unsigned integer quotient. // // From the Intel IA-64 Optimization Guide, choose the minimum latency // alternative. // // in0 holds the dividend. in1 holds the divisor. .text .align 16 .global __udivdi3 .proc __udivdi3 __udivdi3: .regstk 2,0,0,0 // Transfer inputs to FP registers. setf.sig f8 = in0 setf.sig f9 = in1 // Check divide by zero. cmp.ne.unc p0,p7=0,in1 ;; // Convert the inputs to FP, to avoid FP software-assist faults. fcvt.xuf.s1 f8 = f8 fcvt.xuf.s1 f9 = f9 (p7) break 1 ;; // Compute the reciprocal approximation. frcpa.s1 f10, p6 = f8, f9 ;; // 3 Newton-Raphson iterations. (p6) fnma.s1 f11 = f9, f10, f1 (p6) fmpy.s1 f12 = f8, f10 ;; (p6) fmpy.s1 f13 = f11, f11 (p6) fma.s1 f12 = f11, f12, f12 ;; (p6) fma.s1 f10 = f11, f10, f10 (p6) fma.s1 f11 = f13, f12, f12 ;; (p6) fma.s1 f10 = f13, f10, f10 (p6) fnma.s1 f12 = f9, f11, f8 ;; (p6) fma.s1 f10 = f12, f10, f11 ;; // Round quotient to an unsigned integer. fcvt.fxu.trunc.s1 f10 = f10 ;; // Transfer result to GP registers. getf.sig ret0 = f10 br.ret.sptk rp ;; .endp __udivdi3 #endif #ifdef L__umoddi3 // Compute a 64-bit unsigned integer modulus. // // From the Intel IA-64 Optimization Guide, choose the minimum latency // alternative. // // in0 holds the dividend (a). in1 holds the divisor (b). .text .align 16 .global __umoddi3 .proc __umoddi3 __umoddi3: .regstk 2,0,0,0 // Transfer inputs to FP registers. setf.sig f14 = in0 setf.sig f9 = in1 // Check divide by zero. cmp.ne.unc p0,p7=0,in1 ;; // Convert the inputs to FP, to avoid FP software assist faults. fcvt.xuf.s1 f8 = f14 fcvt.xuf.s1 f9 = f9 (p7) break 1; ;; // Compute the reciprocal approximation. frcpa.s1 f10, p6 = f8, f9 ;; // 3 Newton-Raphson iterations. (p6) fmpy.s1 f12 = f8, f10 (p6) fnma.s1 f11 = f9, f10, f1 ;; (p6) fma.s1 f12 = f11, f12, f12 (p6) fmpy.s1 f13 = f11, f11 ;; (p6) fma.s1 f10 = f11, f10, f10 (p6) fma.s1 f11 = f13, f12, f12 ;; sub in1 = r0, in1 (p6) fma.s1 f10 = f13, f10, f10 (p6) fnma.s1 f12 = f9, f11, f8 ;; setf.sig f9 = in1 (p6) fma.s1 f10 = f12, f10, f11 ;; // Round quotient to an unsigned integer. fcvt.fxu.trunc.s1 f10 = f10 ;; // r = q * (-b) + a xma.l f10 = f10, f9, f14 ;; // Transfer result to GP registers. getf.sig ret0 = f10 br.ret.sptk rp ;; .endp __umoddi3 #endif #ifdef L__divsi3 // Compute a 32-bit integer quotient. // // From the Intel IA-64 Optimization Guide, choose the minimum latency // alternative. // // in0 holds the dividend. in1 holds the divisor. .text .align 16 .global __divsi3 .proc __divsi3 __divsi3: .regstk 2,0,0,0 // Check divide by zero. cmp.ne.unc p0,p7=0,in1 sxt4 in0 = in0 sxt4 in1 = in1 ;; setf.sig f8 = in0 setf.sig f9 = in1 (p7) break 1 ;; mov r2 = 0x0ffdd fcvt.xf f8 = f8 fcvt.xf f9 = f9 ;; setf.exp f11 = r2 frcpa.s1 f10, p6 = f8, f9 ;; (p6) fmpy.s1 f8 = f8, f10 (p6) fnma.s1 f9 = f9, f10, f1 ;; (p6) fma.s1 f8 = f9, f8, f8 (p6) fma.s1 f9 = f9, f9, f11 ;; (p6) fma.s1 f10 = f9, f8, f8 ;; fcvt.fx.trunc.s1 f10 = f10 ;; getf.sig ret0 = f10 br.ret.sptk rp ;; .endp __divsi3 #endif #ifdef L__modsi3 // Compute a 32-bit integer modulus. // // From the Intel IA-64 Optimization Guide, choose the minimum latency // alternative. // // in0 holds the dividend. in1 holds the divisor. .text .align 16 .global __modsi3 .proc __modsi3 __modsi3: .regstk 2,0,0,0 mov r2 = 0x0ffdd sxt4 in0 = in0 sxt4 in1 = in1 ;; setf.sig f13 = r32 setf.sig f9 = r33 // Check divide by zero. cmp.ne.unc p0,p7=0,in1 ;; sub in1 = r0, in1 fcvt.xf f8 = f13 fcvt.xf f9 = f9 ;; setf.exp f11 = r2 frcpa.s1 f10, p6 = f8, f9 (p7) break 1 ;; (p6) fmpy.s1 f12 = f8, f10 (p6) fnma.s1 f10 = f9, f10, f1 ;; setf.sig f9 = in1 (p6) fma.s1 f12 = f10, f12, f12 (p6) fma.s1 f10 = f10, f10, f11 ;; (p6) fma.s1 f10 = f10, f12, f12 ;; fcvt.fx.trunc.s1 f10 = f10 ;; xma.l f10 = f10, f9, f13 ;; getf.sig ret0 = f10 br.ret.sptk rp ;; .endp __modsi3 #endif #ifdef L__udivsi3 // Compute a 32-bit unsigned integer quotient. // // From the Intel IA-64 Optimization Guide, choose the minimum latency // alternative. // // in0 holds the dividend. in1 holds the divisor. .text .align 16 .global __udivsi3 .proc __udivsi3 __udivsi3: .regstk 2,0,0,0 mov r2 = 0x0ffdd zxt4 in0 = in0 zxt4 in1 = in1 ;; setf.sig f8 = in0 setf.sig f9 = in1 // Check divide by zero. cmp.ne.unc p0,p7=0,in1 ;; fcvt.xf f8 = f8 fcvt.xf f9 = f9 (p7) break 1 ;; setf.exp f11 = r2 frcpa.s1 f10, p6 = f8, f9 ;; (p6) fmpy.s1 f8 = f8, f10 (p6) fnma.s1 f9 = f9, f10, f1 ;; (p6) fma.s1 f8 = f9, f8, f8 (p6) fma.s1 f9 = f9, f9, f11 ;; (p6) fma.s1 f10 = f9, f8, f8 ;; fcvt.fxu.trunc.s1 f10 = f10 ;; getf.sig ret0 = f10 br.ret.sptk rp ;; .endp __udivsi3 #endif #ifdef L__umodsi3 // Compute a 32-bit unsigned integer modulus. // // From the Intel IA-64 Optimization Guide, choose the minimum latency // alternative. // // in0 holds the dividend. in1 holds the divisor. .text .align 16 .global __umodsi3 .proc __umodsi3 __umodsi3: .regstk 2,0,0,0 mov r2 = 0x0ffdd zxt4 in0 = in0 zxt4 in1 = in1 ;; setf.sig f13 = in0 setf.sig f9 = in1 // Check divide by zero. cmp.ne.unc p0,p7=0,in1 ;; sub in1 = r0, in1 fcvt.xf f8 = f13 fcvt.xf f9 = f9 ;; setf.exp f11 = r2 frcpa.s1 f10, p6 = f8, f9 (p7) break 1; ;; (p6) fmpy.s1 f12 = f8, f10 (p6) fnma.s1 f10 = f9, f10, f1 ;; setf.sig f9 = in1 (p6) fma.s1 f12 = f10, f12, f12 (p6) fma.s1 f10 = f10, f10, f11 ;; (p6) fma.s1 f10 = f10, f12, f12 ;; fcvt.fxu.trunc.s1 f10 = f10 ;; xma.l f10 = f10, f9, f13 ;; getf.sig ret0 = f10 br.ret.sptk rp ;; .endp __umodsi3 #endif #ifdef L__save_stack_nonlocal // Notes on save/restore stack nonlocal: We read ar.bsp but write // ar.bspstore. This is because ar.bsp can be read at all times // (independent of the RSE mode) but since it's read-only we need to // restore the value via ar.bspstore. This is OK because // ar.bsp==ar.bspstore after executing "flushrs". // void __ia64_save_stack_nonlocal(void *save_area, void *stack_pointer) .text .align 16 .global __ia64_save_stack_nonlocal .proc __ia64_save_stack_nonlocal __ia64_save_stack_nonlocal: { .mmf alloc r18 = ar.pfs, 2, 0, 0, 0 mov r19 = ar.rsc ;; } { .mmi flushrs st8 [in0] = in1, 24 and r19 = 0x1c, r19 ;; } { .mmi st8 [in0] = r18, -16 mov ar.rsc = r19 or r19 = 0x3, r19 ;; } { .mmi mov r16 = ar.bsp mov r17 = ar.rnat adds r2 = 8, in0 ;; } { .mmi st8 [in0] = r16 st8 [r2] = r17 } { .mib mov ar.rsc = r19 br.ret.sptk.few rp ;; } .endp __ia64_save_stack_nonlocal #endif #ifdef L__nonlocal_goto // void __ia64_nonlocal_goto(void *target_label, void *save_area, // void *static_chain); .text .align 16 .global __ia64_nonlocal_goto .proc __ia64_nonlocal_goto __ia64_nonlocal_goto: { .mmi alloc r20 = ar.pfs, 3, 0, 0, 0 ld8 r12 = [in1], 8 mov.ret.sptk rp = in0, .L0 ;; } { .mmf ld8 r16 = [in1], 8 mov r19 = ar.rsc ;; } { .mmi flushrs ld8 r17 = [in1], 8 and r19 = 0x1c, r19 ;; } { .mmi ld8 r18 = [in1] mov ar.rsc = r19 or r19 = 0x3, r19 ;; } { .mmi mov ar.bspstore = r16 ;; mov ar.rnat = r17 ;; } { .mmi loadrs invala mov r15 = in2 ;; } .L0: { .mib mov ar.rsc = r19 mov ar.pfs = r18 br.ret.sptk.few rp ;; } .endp __ia64_nonlocal_goto #endif #ifdef L__restore_stack_nonlocal // This is mostly the same as nonlocal_goto above. // ??? This has not been tested yet. // void __ia64_restore_stack_nonlocal(void *save_area) .text .align 16 .global __ia64_restore_stack_nonlocal .proc __ia64_restore_stack_nonlocal __ia64_restore_stack_nonlocal: { .mmf alloc r20 = ar.pfs, 4, 0, 0, 0 ld8 r12 = [in0], 8 ;; } { .mmb ld8 r16=[in0], 8 mov r19 = ar.rsc ;; } { .mmi flushrs ld8 r17 = [in0], 8 and r19 = 0x1c, r19 ;; } { .mmf ld8 r18 = [in0] mov ar.rsc = r19 ;; } { .mmi mov ar.bspstore = r16 ;; mov ar.rnat = r17 or r19 = 0x3, r19 ;; } { .mmf loadrs invala ;; } .L0: { .mib mov ar.rsc = r19 mov ar.pfs = r18 br.ret.sptk.few rp ;; } .endp __ia64_restore_stack_nonlocal #endif #ifdef L__trampoline // Implement the nested function trampoline. This is out of line // so that we don't have to bother with flushing the icache, as // well as making the on-stack trampoline smaller. // // The trampoline has the following form: // // +-------------------+ > // TRAMP: | __ia64_trampoline | | // +-------------------+ > fake function descriptor // | TRAMP+16 | | // +-------------------+ > // | target descriptor | // +-------------------+ // | static link | // +-------------------+ .text .align 16 .global __ia64_trampoline .proc __ia64_trampoline __ia64_trampoline: { .mmi ld8 r2 = [r1], 8 ;; ld8 r15 = [r1] } { .mmi ld8 r3 = [r2], 8 ;; ld8 r1 = [r2] mov b6 = r3 } { .bbb br.sptk.many b6 ;; } .endp __ia64_trampoline #endif #ifdef SHARED // Thunks for backward compatibility. #ifdef L_fixtfdi .text .align 16 .global __fixtfti .proc __fixtfti __fixtfti: { .bbb br.sptk.many __fixxfti ;; } .endp __fixtfti #endif #ifdef L_fixunstfdi .align 16 .global __fixunstfti .proc __fixunstfti __fixunstfti: { .bbb br.sptk.many __fixunsxfti ;; } .endp __fixunstfti #endif #ifdef L_floatditf .align 16 .global __floattitf .proc __floattitf __floattitf: { .bbb br.sptk.many __floattixf ;; } .endp __floattitf #endif #endif