/* Subroutines used to expand string operations for RISC-V.
   Copyright (C) 2023-2024 Free Software Foundation, Inc.

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it
under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 3, or (at your option)
any later version.

GCC is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
General Public License for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */

#define IN_TARGET_CODE 1

#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "backend.h"
#include "rtl.h"
#include "tree.h"
#include "memmodel.h"
#include "tm_p.h"
#include "ira.h"
#include "print-tree.h"
#include "varasm.h"
#include "explow.h"
#include "expr.h"
#include "output.h"
#include "target.h"
#include "predict.h"
#include "optabs.h"
#include "riscv-protos.h"
#include "recog.h"
#include "tm-constrs.h"

/* Emit proper instruction depending on mode of dest.  */

#define GEN_EMIT_HELPER2(name)						\
static rtx_insn *							\
do_## name ## 2 (rtx dest, rtx src)					\
{									\
  rtx_insn *insn;							\
  if (GET_MODE (dest) == DImode)					\
    insn = emit_insn (gen_ ## name ## di2 (dest, src));			\
  else									\
    insn = emit_insn (gen_ ## name ## si2 (dest, src));			\
  return insn;								\
}

/* Emit proper instruction depending on mode of dest.  */

#define GEN_EMIT_HELPER3(name)						\
static rtx_insn *							\
do_## name ## 3 (rtx dest, rtx src1, rtx src2)				\
{									\
  rtx_insn *insn;							\
  if (GET_MODE (dest) == DImode)					\
    insn = emit_insn (gen_ ## name ## di3 (dest, src1, src2));		\
  else									\
    insn = emit_insn (gen_ ## name ## si3 (dest, src1, src2));		\
  return insn;								\
}

GEN_EMIT_HELPER3(add)		/* do_add3  */
GEN_EMIT_HELPER3(and)		/* do_and3  */
GEN_EMIT_HELPER3(ashl)		/* do_ashl3  */
GEN_EMIT_HELPER2(bswap)		/* do_bswap2  */
GEN_EMIT_HELPER2(clz)		/* do_clz2  */
GEN_EMIT_HELPER2(ctz)		/* do_ctz2  */
GEN_EMIT_HELPER3(ior)		/* do_ior3  */
GEN_EMIT_HELPER3(ior_not)	/* do_ior_not3  */
GEN_EMIT_HELPER3(lshr)		/* do_lshr3  */
GEN_EMIT_HELPER2(neg)		/* do_neg2  */
GEN_EMIT_HELPER2(orcb)		/* do_orcb2  */
GEN_EMIT_HELPER2(one_cmpl)	/* do_one_cmpl2  */
GEN_EMIT_HELPER3(rotr)		/* do_rotr3  */
GEN_EMIT_HELPER3(sub)		/* do_sub3  */
GEN_EMIT_HELPER2(th_rev)	/* do_th_rev2  */
GEN_EMIT_HELPER2(th_tstnbz)	/* do_th_tstnbz2  */
GEN_EMIT_HELPER3(xor)		/* do_xor3  */
GEN_EMIT_HELPER2(zero_extendqi)	/* do_zero_extendqi2  */

#undef GEN_EMIT_HELPER2
#undef GEN_EMIT_HELPER3

/* Helper function to load a byte or an Xmode register.

   MODE is the mode to use for the load (QImode or Xmode).
   DEST is the destination register for the data.
   ADDR_REG is the register that holds the address.
   ADDR is the address expression to load from.

   This function returns an rtx containing the register,
   where the ADDR is stored.  */

static rtx
do_load_from_addr (machine_mode mode, rtx dest, rtx addr_reg, rtx addr)
{
  rtx mem = gen_rtx_MEM (mode, addr_reg);
  MEM_COPY_ATTRIBUTES (mem, addr);
  set_mem_size (mem, GET_MODE_SIZE (mode));

  if (mode == QImode)
    do_zero_extendqi2 (dest, mem);
  else if (mode == Xmode)
    emit_move_insn (dest, mem);
  else
    gcc_unreachable ();

  return addr_reg;
}
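/* Illustration only, not part of the emitted sequence: on rv64, where
   Xmode == DImode, a call like

     do_load_from_addr (QImode, dest, addr_reg, src);

   expands to a zero-extending byte load (lbu), while passing Xmode loads
   a full 64-bit word (ld).  The do_*2/do_*3 helpers above dispatch the
   same way on the mode of their destination, e.g. do_add3 on an Xmode
   register emits gen_adddi3 on rv64 and gen_addsi3 on rv32.  */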
/* Generate a sequence to compare single characters in data1 and data2.

   RESULT is the register where the return value of str(n)cmp will be stored.
   DATA1 is a register which contains character1.
   DATA2 is a register which contains character2.
   FINAL_LABEL is the location after the calculation of the return value.  */

static void
emit_strcmp_scalar_compare_byte (rtx result, rtx data1, rtx data2,
				 rtx final_label)
{
  rtx tmp = gen_reg_rtx (Xmode);
  do_sub3 (tmp, data1, data2);
  emit_insn (gen_movsi (result, gen_lowpart (SImode, tmp)));
  emit_jump_insn (gen_jump (final_label));
  emit_barrier (); /* No fall-through.  */
}

/* Generate a sequence to compare two strings in data1 and data2.

   DATA1 is a register which contains string1.
   DATA2 is a register which contains string2.
   ORC1 is a register where orc.b(data1) will be stored.
   CMP_BYTES is the length of the strings.
   END_LABEL is the location of the code that calculates the return value.  */

static void
emit_strcmp_scalar_compare_subword (rtx data1, rtx data2, rtx orc1,
				    unsigned HOST_WIDE_INT cmp_bytes,
				    rtx end_label)
{
  /* Set a NUL-byte after the relevant data (behind the string).  */
  long long im = -256ll;
  rtx imask = gen_rtx_CONST_INT (Xmode, im);
  rtx m_reg = gen_reg_rtx (Xmode);
  emit_insn (gen_rtx_SET (m_reg, imask));
  do_rotr3 (m_reg, m_reg, GEN_INT (64 - cmp_bytes * BITS_PER_UNIT));
  do_and3 (data1, m_reg, data1);
  do_and3 (data2, m_reg, data2);
  if (TARGET_ZBB)
    do_orcb2 (orc1, data1);
  else
    do_th_tstnbz2 (orc1, data1);
  emit_jump_insn (gen_jump (end_label));
  emit_barrier (); /* No fall-through.  */
}

/* Generate a sequence to compare two strings in data1 and data2.

   DATA1 is a register which contains string1.
   DATA2 is a register which contains string2.
   ORC1 is a register where orc.b(data1) will be stored.
   TESTVAL is the value to test ORC1 against.
   END_LABEL is the location of the code that calculates the return value.
   NONUL_END_LABEL is the location of the code that calculates the return
   value in case the first string does not contain a NUL byte.  */

static void
emit_strcmp_scalar_compare_word (rtx data1, rtx data2, rtx orc1, rtx testval,
				 rtx end_label, rtx nonul_end_label)
{
  /* Check if data1 contains a NUL character.  */
  if (TARGET_ZBB)
    do_orcb2 (orc1, data1);
  else
    do_th_tstnbz2 (orc1, data1);
  rtx cond1 = gen_rtx_NE (VOIDmode, orc1, testval);
  emit_unlikely_jump_insn (gen_cbranch4 (Pmode, cond1, orc1, testval,
					 end_label));

  /* Break out if data1 != data2.  */
  rtx cond2 = gen_rtx_NE (VOIDmode, data1, data2);
  emit_unlikely_jump_insn (gen_cbranch4 (Pmode, cond2, data1, data2,
					 nonul_end_label));
  /* Fall-through on equality.  */
}
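/* Worked example of the NUL detection above (values illustrative):
   orc.b sets every byte of the result to 0xff if the corresponding
   source byte is non-zero and to 0x00 if it is zero.  For the
   little-endian rv64 word 0x0064636261616161 ("aaaabcd\0"), orc.b
   yields 0x00ffffffffffffff != -1, so the word contains a NUL byte
   and we branch to END_LABEL.  th.tstnbz computes the complement
   (0xff marks the zero bytes), hence the test against 0 instead
   of -1.  */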
/* Generate the sequence of compares for strcmp/strncmp using zbb
   instructions.

   RESULT is the register where the return value of str(n)cmp will be stored.
   The strings are referenced by SRC1 and SRC2.
   The number of bytes to compare is defined by NBYTES.
   DATA1 is a register where string1 will be stored.
   DATA2 is a register where string2 will be stored.
   ORC1 is a register where orc.b(data1) will be stored.
   END_LABEL is the location of the code that calculates the return value.
   NONUL_END_LABEL is the location of the code that calculates the return
   value in case the first string does not contain a NUL byte.
   FINAL_LABEL is the location of the code that comes after the calculation
   of the return value.  */

static void
emit_strcmp_scalar_load_and_compare (rtx result, rtx src1, rtx src2,
				     unsigned HOST_WIDE_INT nbytes,
				     rtx data1, rtx data2, rtx orc1,
				     rtx end_label, rtx nonul_end_label,
				     rtx final_label)
{
  const unsigned HOST_WIDE_INT xlen = GET_MODE_SIZE (Xmode);
  rtx src1_addr = force_reg (Pmode, XEXP (src1, 0));
  rtx src2_addr = force_reg (Pmode, XEXP (src2, 0));
  unsigned HOST_WIDE_INT offset = 0;

  rtx testval = gen_reg_rtx (Xmode);
  if (TARGET_ZBB)
    emit_insn (gen_rtx_SET (testval, constm1_rtx));
  else
    emit_insn (gen_rtx_SET (testval, const0_rtx));

  while (nbytes > 0)
    {
      unsigned HOST_WIDE_INT cmp_bytes = xlen < nbytes ? xlen : nbytes;
      machine_mode load_mode;
      if (cmp_bytes == 1)
	load_mode = QImode;
      else
	load_mode = Xmode;

      rtx addr1 = gen_rtx_PLUS (Pmode, src1_addr, GEN_INT (offset));
      do_load_from_addr (load_mode, data1, addr1, src1);
      rtx addr2 = gen_rtx_PLUS (Pmode, src2_addr, GEN_INT (offset));
      do_load_from_addr (load_mode, data2, addr2, src2);

      if (cmp_bytes == 1)
	{
	  emit_strcmp_scalar_compare_byte (result, data1, data2, final_label);
	  return;
	}
      else if (cmp_bytes < xlen)
	{
	  emit_strcmp_scalar_compare_subword (data1, data2, orc1, cmp_bytes,
					      end_label);
	  return;
	}
      else
	emit_strcmp_scalar_compare_word (data1, data2, orc1, testval,
					 end_label, nonul_end_label);

      offset += cmp_bytes;
      nbytes -= cmp_bytes;
    }
}

/* Fixup pointers and generate a call to strcmp.

   RESULT is the register where the return value of str(n)cmp will be stored.
   The strings are referenced by SRC1 and SRC2.
   The number of already compared bytes is defined by NBYTES.  */

static void
emit_strcmp_scalar_call_to_libc (rtx result, rtx src1, rtx src2,
				 unsigned HOST_WIDE_INT nbytes)
{
  /* Update pointers past what has been compared already.  */
  rtx src1_addr = force_reg (Pmode, XEXP (src1, 0));
  rtx src2_addr = force_reg (Pmode, XEXP (src2, 0));
  rtx src1_new = force_reg (Pmode,
			    gen_rtx_PLUS (Pmode, src1_addr, GEN_INT (nbytes)));
  rtx src2_new = force_reg (Pmode,
			    gen_rtx_PLUS (Pmode, src2_addr, GEN_INT (nbytes)));

  /* Construct call to strcmp to compare the rest of the string.  */
  tree fun = builtin_decl_explicit (BUILT_IN_STRCMP);
  emit_library_call_value (XEXP (DECL_RTL (fun), 0),
			   result, LCT_NORMAL, GET_MODE (result),
			   src1_new, Pmode, src2_new, Pmode);
}

/* Fast strcmp-result calculation if no NUL byte in string1.

   RESULT is the register where the return value of str(n)cmp will be stored.
   The mismatching strings are stored in DATA1 and DATA2.  */

static void
emit_strcmp_scalar_result_calculation_nonul (rtx result, rtx data1, rtx data2)
{
  /* Words don't match, and no NUL byte in one word.
     Get bytes in big-endian order and compare as words.  */
  do_bswap2 (data1, data1);
  do_bswap2 (data2, data2);

  /* Synthesize (data1 >= data2) ? 1 : -1 in a branchless sequence.  */
  rtx tmp = gen_reg_rtx (Xmode);
  emit_insn (gen_slt_3 (LTU, Xmode, Xmode, tmp, data1, data2));
  do_neg2 (tmp, tmp);
  do_ior3 (tmp, tmp, const1_rtx);
  emit_insn (gen_movsi (result, gen_lowpart (SImode, tmp)));
}

/* strcmp-result calculation.

   RESULT is the register where the return value of str(n)cmp will be stored.
   The strings are stored in DATA1 and DATA2.
   ORC1 contains orc.b(DATA1).  */

static void
emit_strcmp_scalar_result_calculation (rtx result, rtx data1, rtx data2,
				       rtx orc1)
{
  const unsigned HOST_WIDE_INT xlen = GET_MODE_SIZE (Xmode);

  /* Convert non-equal bytes into non-NUL bytes.  */
  rtx diff = gen_reg_rtx (Xmode);
  do_xor3 (diff, data1, data2);
  rtx shift = gen_reg_rtx (Xmode);
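  /* Worked example for the syndrome computation below (illustrative,
     rv64/Zbb): comparing "abcdXfgh" against "abcdYfgh", diff is non-zero
     only in byte 4, so orc.b leaves diff == 0x000000ff00000000.  Without
     a NUL byte, orc1 is all-ones and syndrome = ~orc1 | diff == diff;
     ctz then yields 32, the number of equal low-order bits.  After the
     bswap, shifting left by 32 moves the differing byte to the top, and
     the logical right shift by 56 zero-extends it, so the final
     subtraction returns the sign of 'X' - 'Y'.  */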
  if (TARGET_ZBB)
    {
      /* Convert non-equal or NUL-bytes into non-NUL bytes.  */
      rtx syndrome = gen_reg_rtx (Xmode);
      do_orcb2 (diff, diff);
      do_ior_not3 (syndrome, orc1, diff);
      /* Count the number of equal bits from the beginning of the word.  */
      do_ctz2 (shift, syndrome);
    }
  else
    {
      /* Convert non-equal or NUL-bytes into non-NUL bytes.  */
      rtx syndrome = gen_reg_rtx (Xmode);
      do_th_tstnbz2 (diff, diff);
      do_one_cmpl2 (diff, diff);
      do_ior3 (syndrome, orc1, diff);
      /* Count the number of equal bits from the beginning of the word.  */
      do_th_rev2 (syndrome, syndrome);
      do_clz2 (shift, syndrome);
    }

  do_bswap2 (data1, data1);
  do_bswap2 (data2, data2);

  /* The most-significant-non-zero bit of the syndrome marks either the
     first bit that is different, or the top bit of the first zero byte.
     Shifting left now will bring the critical information into the
     top bits.  */
  do_ashl3 (data1, data1, gen_lowpart (QImode, shift));
  do_ashl3 (data2, data2, gen_lowpart (QImode, shift));

  /* But we need to zero-extend (char is unsigned) the value and then
     perform a signed 32-bit subtraction.  */
  unsigned int shiftr = (xlen - 1) * BITS_PER_UNIT;
  do_lshr3 (data1, data1, GEN_INT (shiftr));
  do_lshr3 (data2, data2, GEN_INT (shiftr));

  rtx tmp = gen_reg_rtx (Xmode);
  do_sub3 (tmp, data1, data2);
  emit_insn (gen_movsi (result, gen_lowpart (SImode, tmp)));
}

/* Expand str(n)cmp using Zbb/TheadBb instructions.

   The result will be stored in RESULT.
   The strings are referenced by SRC1 and SRC2.
   The number of bytes to compare is defined by NBYTES.
   The alignment is defined by ALIGNMENT.
   If NCOMPARE is false then libc's strcmp() will be called if comparing
   NBYTES of both strings did not find differences or NUL bytes.

   Return true if expansion was successful, or false otherwise.  */

static bool
riscv_expand_strcmp_scalar (rtx result, rtx src1, rtx src2,
			    unsigned HOST_WIDE_INT nbytes,
			    unsigned HOST_WIDE_INT alignment,
			    bool ncompare)
{
  const unsigned HOST_WIDE_INT xlen = GET_MODE_SIZE (Xmode);

  gcc_assert (TARGET_ZBB || TARGET_XTHEADBB);
  gcc_assert (nbytes > 0);
  gcc_assert ((int) nbytes <= riscv_strcmp_inline_limit);
  gcc_assert (ncompare || (nbytes & (xlen - 1)) == 0);

  /* Limit to 12-bits (maximum load-offset).  */
  if (nbytes > IMM_REACH)
    nbytes = IMM_REACH;

  /* We don't support big endian.  */
  if (BYTES_BIG_ENDIAN)
    return false;

  /* We need xlen-aligned strings.  */
  if (alignment < xlen)
    return false;

  /* Overall structure of emitted code:
       Load-and-compare:
	 - Load data1 and data2
	 - Set orc1 := orc.b (data1) (or th.tstnbz)
	 - Compare strings and either:
	   - Fall-through on equality
	   - Jump to nonul_end_label if data1 != data2,
	     or to end_label if data1 contains a NUL byte
	   - Calculate result value and jump to final_label
	     // Fall-through
       Call-to-libc or set result to 0 (depending on ncompare)
       Jump to final_label
     nonul_end_label: // words don't match, and no NUL byte in first word
       Calculate result value with the use of data1, data2 and orc1
       Jump to final_label
     end_label:
       Calculate result value with the use of data1, data2 and orc1
       Jump to final_label
     final_label:
       // Nothing.  */

  rtx data1 = gen_reg_rtx (Xmode);
  rtx data2 = gen_reg_rtx (Xmode);
  rtx orc1 = gen_reg_rtx (Xmode);
  rtx nonul_end_label = gen_label_rtx ();
  rtx end_label = gen_label_rtx ();
  rtx final_label = gen_label_rtx ();

  /* Generate a sequence of zbb instructions to compare out
     to the length specified.  */
  emit_strcmp_scalar_load_and_compare (result, src1, src2, nbytes,
				       data1, data2, orc1,
				       end_label, nonul_end_label,
				       final_label);
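  /* A rough sketch of what the per-word load-and-compare sequence above
     emits on rv64gc_zbb (register names purely illustrative):

	ld	a4, 0(a0)	# data1
	ld	a5, 0(a1)	# data2
	orc.b	a6, a4		# orc1
	bne	a6, t2, .Lend	# NUL byte in data1?
	bne	a4, a5, .Lnonul	# words differ (no NUL so far)?
				# fall-through: equal, check next word
  */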
  /* All compared and everything was equal.  */
  if (ncompare)
    {
      emit_insn (gen_rtx_SET (result, gen_rtx_CONST_INT (SImode, 0)));
      emit_jump_insn (gen_jump (final_label));
      emit_barrier (); /* No fall-through.  */
    }
  else
    {
      emit_strcmp_scalar_call_to_libc (result, src1, src2, nbytes);
      emit_jump_insn (gen_jump (final_label));
      emit_barrier (); /* No fall-through.  */
    }

  emit_label (nonul_end_label);
  emit_strcmp_scalar_result_calculation_nonul (result, data1, data2);
  emit_jump_insn (gen_jump (final_label));
  emit_barrier (); /* No fall-through.  */

  emit_label (end_label);
  emit_strcmp_scalar_result_calculation (result, data1, data2, orc1);
  emit_jump_insn (gen_jump (final_label));
  emit_barrier (); /* No fall-through.  */

  emit_label (final_label);
  return true;
}

/* Expand a string compare operation.

   The result will be stored in RESULT.
   The strings are referenced by SRC1 and SRC2.
   The argument BYTES_RTX either holds the number of characters to
   compare, or is NULL_RTX.  The argument ALIGN_RTX holds the alignment.

   Return true if expansion was successful, or false otherwise.  */

bool
riscv_expand_strcmp (rtx result, rtx src1, rtx src2,
		     rtx bytes_rtx, rtx align_rtx)
{
  unsigned HOST_WIDE_INT compare_max;
  unsigned HOST_WIDE_INT nbytes;
  unsigned HOST_WIDE_INT alignment;
  bool ncompare = bytes_rtx != NULL_RTX;
  const unsigned HOST_WIDE_INT xlen = GET_MODE_SIZE (Xmode);

  if (riscv_strcmp_inline_limit == 0)
    return false;

  /* Round down the comparison limit to a multiple of xlen.  */
  compare_max = riscv_strcmp_inline_limit & ~(xlen - 1);

  /* Decide how many bytes to compare inline.  */
  if (bytes_rtx == NULL_RTX)
    {
      nbytes = compare_max;
    }
  else
    {
      /* If we have a length, it must be constant.  */
      if (!CONST_INT_P (bytes_rtx))
	return false;
      nbytes = UINTVAL (bytes_rtx);

      /* We don't emit parts of a strncmp() call.  */
      if (nbytes > compare_max)
	return false;
    }

  /* Guarantees:
     - nbytes > 0
     - nbytes <= riscv_strcmp_inline_limit
     - nbytes is a multiple of xlen if !ncompare  */

  if (!CONST_INT_P (align_rtx))
    return false;
  alignment = UINTVAL (align_rtx);

  if (TARGET_VECTOR && stringop_strategy & STRATEGY_VECTOR)
    {
      bool ok = riscv_vector::expand_strcmp (result, src1, src2, bytes_rtx,
					     alignment, ncompare);
      if (ok)
	return true;
    }

  if ((TARGET_ZBB || TARGET_XTHEADBB) && stringop_strategy & STRATEGY_SCALAR)
    return riscv_expand_strcmp_scalar (result, src1, src2, nbytes, alignment,
				       ncompare);

  return false;
}

/* If the provided string is aligned, then read XLEN bytes
   in a loop and use orc.b to find NUL bytes.  */

static bool
riscv_expand_strlen_scalar (rtx result, rtx src, rtx align)
{
  rtx testval, addr, addr_plus_regsz, word, zeros;
  rtx loop_label, cond;

  gcc_assert (TARGET_ZBB || TARGET_XTHEADBB);

  /* The alignment needs to be known and big enough.  */
  if (!CONST_INT_P (align) || UINTVAL (align) < GET_MODE_SIZE (Xmode))
    return false;

  testval = gen_reg_rtx (Xmode);
  addr = copy_addr_to_reg (XEXP (src, 0));
  addr_plus_regsz = gen_reg_rtx (Pmode);
  word = gen_reg_rtx (Xmode);
  zeros = gen_reg_rtx (Xmode);

  if (TARGET_ZBB)
    emit_insn (gen_rtx_SET (testval, constm1_rtx));
  else
    emit_insn (gen_rtx_SET (testval, const0_rtx));

  do_add3 (addr_plus_regsz, addr, GEN_INT (UNITS_PER_WORD));

  loop_label = gen_label_rtx ();
  emit_label (loop_label);
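  /* The loop emitted below looks roughly like this on rv64gc_zbb
     (register names purely illustrative):

     .Lloop:
	ld	a5, 0(a4)	# Load a word.
	addi	a4, a4, 8	# Bump the address.
	orc.b	a5, a5		# 0xff for each non-NUL byte.
	beq	a5, t1, .Lloop	# All bytes non-NUL -> next word.
  */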
  /* Load a word and use orc.b/th.tstnbz to find a zero-byte.  */
  do_load_from_addr (Xmode, word, addr, src);
  do_add3 (addr, addr, GEN_INT (UNITS_PER_WORD));
  if (TARGET_ZBB)
    do_orcb2 (word, word);
  else
    do_th_tstnbz2 (word, word);
  cond = gen_rtx_EQ (VOIDmode, word, testval);
  emit_unlikely_jump_insn (gen_cbranch4 (Xmode, cond, word, testval,
					 loop_label));

  /* Calculate the return value by counting zero-bits.  */
  if (TARGET_ZBB)
    do_one_cmpl2 (word, word);
  if (TARGET_BIG_ENDIAN)
    do_clz2 (zeros, word);
  else if (TARGET_ZBB)
    do_ctz2 (zeros, word);
  else
    {
      do_th_rev2 (word, word);
      do_clz2 (zeros, word);
    }
  do_lshr3 (zeros, zeros, GEN_INT (exact_log2 (BITS_PER_UNIT)));
  do_add3 (addr, addr, zeros);
  do_sub3 (result, addr, addr_plus_regsz);

  return true;
}

/* Expand a strlen operation and return true if successful.
   Return false if we should let the compiler generate normal
   code, probably a strlen call.  */

bool
riscv_expand_strlen (rtx result, rtx src, rtx search_char, rtx align)
{
  if (TARGET_VECTOR && stringop_strategy & STRATEGY_VECTOR)
    {
      riscv_vector::expand_rawmemchr (E_QImode, result, src, search_char,
				      /* strlen */ true);
      return true;
    }

  gcc_assert (search_char == const0_rtx);

  if ((TARGET_ZBB || TARGET_XTHEADBB) && stringop_strategy & STRATEGY_SCALAR)
    return riscv_expand_strlen_scalar (result, src, align);

  return false;
}

/* Emit straight-line code to move LENGTH bytes from SRC to DEST.
   Assume that the areas do not overlap.  */

static void
riscv_block_move_straight (rtx dest, rtx src, unsigned HOST_WIDE_INT length)
{
  unsigned HOST_WIDE_INT offset, delta;
  unsigned HOST_WIDE_INT bits;
  int i;
  enum machine_mode mode;
  rtx *regs;

  bits = MAX (BITS_PER_UNIT,
	      MIN (BITS_PER_WORD, MIN (MEM_ALIGN (src), MEM_ALIGN (dest))));

  mode = mode_for_size (bits, MODE_INT, 0).require ();
  delta = bits / BITS_PER_UNIT;

  /* Allocate a buffer for the temporary registers.  */
  regs = XALLOCAVEC (rtx, length / delta);

  /* Load as many BITS-sized chunks as possible.  */
  for (offset = 0, i = 0; offset + delta <= length; offset += delta, i++)
    {
      regs[i] = gen_reg_rtx (mode);
      riscv_emit_move (regs[i], adjust_address (src, mode, offset));
    }

  /* Copy the chunks to the destination.  */
  for (offset = 0, i = 0; offset + delta <= length; offset += delta, i++)
    riscv_emit_move (adjust_address (dest, mode, offset), regs[i]);

  /* Mop up any left-over bytes.  */
  if (offset < length)
    {
      src = adjust_address (src, BLKmode, offset);
      dest = adjust_address (dest, BLKmode, offset);
      move_by_pieces (dest, src, length - offset,
		      MIN (MEM_ALIGN (src), MEM_ALIGN (dest)), RETURN_BEGIN);
    }
}

/* Helper function for doing a loop-based block operation on memory
   reference MEM.  Each iteration of the loop will operate on LENGTH
   bytes of MEM.

   Create a new base register for use within the loop and point it to
   the start of MEM.  Create a new memory reference that uses this
   register.  Store them in *LOOP_REG and *LOOP_MEM respectively.  */

static void
riscv_adjust_block_mem (rtx mem, unsigned HOST_WIDE_INT length,
			rtx *loop_reg, rtx *loop_mem)
{
  *loop_reg = copy_addr_to_reg (XEXP (mem, 0));

  /* Although the new mem does not refer to a known location,
     it does keep up to LENGTH bytes of alignment.  */
  *loop_mem = change_address (mem, BLKmode, *loop_reg);
  set_mem_align (*loop_mem, MIN (MEM_ALIGN (mem), length * BITS_PER_UNIT));
}
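/* For reference (illustrative registers): riscv_block_move_straight
   above expands a 16-byte copy of word-aligned operands on rv64 into
   two load/store pairs, with all loads emitted before the stores:

	ld	t0, 0(a1)
	ld	t1, 8(a1)
	sd	t0, 0(a0)
	sd	t1, 8(a0)

   Loading everything first avoids interleaved load/store dependencies
   and gives the scheduler more freedom.  */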
/* Move LENGTH bytes from SRC to DEST using a loop that moves
   BYTES_PER_ITER bytes at a time.  LENGTH must be at least
   BYTES_PER_ITER.  Assume that the memory regions do not overlap.  */

static void
riscv_block_move_loop (rtx dest, rtx src, unsigned HOST_WIDE_INT length,
		       unsigned HOST_WIDE_INT bytes_per_iter)
{
  rtx label, src_reg, dest_reg, final_src, test;
  unsigned HOST_WIDE_INT leftover;

  leftover = length % bytes_per_iter;
  length -= leftover;

  /* Create registers and memory references for use within the loop.  */
  riscv_adjust_block_mem (src, bytes_per_iter, &src_reg, &src);
  riscv_adjust_block_mem (dest, bytes_per_iter, &dest_reg, &dest);

  /* Calculate the value that SRC_REG should have after the last iteration
     of the loop.  */
  final_src = expand_simple_binop (Pmode, PLUS, src_reg, GEN_INT (length),
				   0, 0, OPTAB_WIDEN);

  /* Emit the start of the loop.  */
  label = gen_label_rtx ();
  emit_label (label);

  /* Emit the loop body.  */
  riscv_block_move_straight (dest, src, bytes_per_iter);

  /* Move on to the next block.  */
  riscv_emit_move (src_reg, plus_constant (Pmode, src_reg, bytes_per_iter));
  riscv_emit_move (dest_reg, plus_constant (Pmode, dest_reg, bytes_per_iter));

  /* Emit the loop condition.  */
  test = gen_rtx_NE (VOIDmode, src_reg, final_src);
  emit_jump_insn (gen_cbranch4 (Pmode, test, src_reg, final_src, label));

  /* Mop up any left-over bytes.  */
  if (leftover)
    riscv_block_move_straight (dest, src, leftover);
  else
    emit_insn (gen_nop ());
}

/* Expand a cpymemsi instruction, which copies LENGTH bytes from
   memory reference SRC to memory reference DEST.  */

static bool
riscv_expand_block_move_scalar (rtx dest, rtx src, rtx length)
{
  if (!CONST_INT_P (length))
    return false;

  unsigned HOST_WIDE_INT hwi_length = UINTVAL (length);
  unsigned HOST_WIDE_INT factor, align;

  align = MIN (MIN (MEM_ALIGN (src), MEM_ALIGN (dest)), BITS_PER_WORD);
  factor = BITS_PER_WORD / align;

  if (optimize_function_for_size_p (cfun)
      && hwi_length * factor * UNITS_PER_WORD > MOVE_RATIO (false))
    return false;

  if (hwi_length <= (RISCV_MAX_MOVE_BYTES_STRAIGHT / factor))
    {
      riscv_block_move_straight (dest, src, INTVAL (length));
      return true;
    }
  else if (optimize && align >= BITS_PER_WORD)
    {
      unsigned min_iter_words
	= RISCV_MAX_MOVE_BYTES_PER_LOOP_ITER / UNITS_PER_WORD;
      unsigned iter_words = min_iter_words;
      unsigned HOST_WIDE_INT bytes = hwi_length;
      unsigned HOST_WIDE_INT words = bytes / UNITS_PER_WORD;

      /* Lengthen the loop body if it shortens the tail.  */
      for (unsigned i = min_iter_words; i < min_iter_words * 2 - 1; i++)
	{
	  unsigned cur_cost = iter_words + words % iter_words;
	  unsigned new_cost = i + words % i;
	  if (new_cost <= cur_cost)
	    iter_words = i;
	}

      riscv_block_move_loop (dest, src, bytes, iter_words * UNITS_PER_WORD);
      return true;
    }

  return false;
}
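/* Worked example of the loop-sizing heuristic above (illustrative,
   assuming min_iter_words == 4): for an 11-word copy the candidates are
   i = 4 (cost 4 + 11 % 4 = 7), i = 5 (cost 5 + 11 % 5 = 6) and
   i = 6 (cost 6 + 11 % 6 = 11), so 5 words per iteration win: two
   iterations copy 10 words and a single-word straight-line tail mops
   up the rest.  */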
/* This function delegates block-move expansion to either the vector
   implementation or the scalar one.
   Return TRUE if successful or FALSE otherwise.  */

bool
riscv_expand_block_move (rtx dest, rtx src, rtx length)
{
  if ((TARGET_VECTOR && !TARGET_XTHEADVECTOR)
      && stringop_strategy & STRATEGY_VECTOR)
    {
      bool ok = riscv_vector::expand_block_move (dest, src, length);
      if (ok)
	return true;
    }

  if (stringop_strategy & STRATEGY_SCALAR)
    return riscv_expand_block_move_scalar (dest, src, length);

  return false;
}

/* --- Vector expanders --- */

namespace riscv_vector {

/* Used by cpymemsi in riscv.md.  */

bool
expand_block_move (rtx dst_in, rtx src_in, rtx length_in)
{
  /*
    memcpy:
	mv a3, a0                       # Copy destination
    loop:
	vsetvli t0, a2, e8, m8, ta, ma  # Vectors of 8b
	vle8.v v0, (a1)                 # Load bytes
	add a1, a1, t0                  # Bump pointer
	sub a2, a2, t0                  # Decrement count
	vse8.v v0, (a3)                 # Store bytes
	add a3, a3, t0                  # Bump pointer
	bnez a2, loop                   # Any more?
	ret                             # Return
  */
  gcc_assert (TARGET_VECTOR);

  HOST_WIDE_INT potential_ew
    = (MIN (MIN (MEM_ALIGN (src_in), MEM_ALIGN (dst_in)), BITS_PER_WORD)
       / BITS_PER_UNIT);
  machine_mode vmode = VOIDmode;
  bool need_loop = true;
  bool size_p = optimize_function_for_size_p (cfun);
  rtx src, dst;
  rtx vec;
  rtx length_rtx = length_in;

  if (CONST_INT_P (length_in))
    {
      HOST_WIDE_INT length = INTVAL (length_in);

      /* By using LMUL=8, we can copy as many bytes in one go as there
	 are bits in a vector register.  If the entire block thus fits,
	 we don't need a loop.  */
      if (length <= TARGET_MIN_VLEN)
	{
	  need_loop = false;

	  /* If a single scalar load / store pair can do the job, leave it
	     to the scalar code to do that.  */
	  /* ??? If fast unaligned access is supported, the scalar code could
	     use suitably sized scalars irrespective of alignment.  If that
	     gets fixed, we have to adjust the test here.  */
	  if (pow2p_hwi (length) && length <= potential_ew)
	    return false;
	}

      /* Find the vector mode to use.  Using the largest possible element
	 size is likely to give smaller constants, and thus potentially
	 reduce code size.  However, if we need a loop, we need to update
	 the pointers, and that is more complicated with a larger element
	 size, unless we use an immediate, which prevents us from
	 dynamically using the target's transfer size that the hart
	 supports.  And then, unless we know the *exact* vector size of
	 the hart, we'd need multiple vsetvli / branch statements, so it's
	 not even a size win.  If, in the future, we find a RISC-V
	 implementation that is slower for small element widths, we might
	 allow larger element widths for loops too.  */
      if (need_loop)
	potential_ew = 1;
      for (; potential_ew; potential_ew >>= 1)
	{
	  scalar_int_mode elem_mode;
	  unsigned HOST_WIDE_INT bits = potential_ew * BITS_PER_UNIT;
	  unsigned HOST_WIDE_INT per_iter;
	  HOST_WIDE_INT nunits;

	  if (need_loop)
	    per_iter = TARGET_MIN_VLEN;
	  else
	    per_iter = length;
	  nunits = per_iter / potential_ew;

	  /* Unless we get an implementation that's slow for small element
	     size / non-word-aligned accesses, we assume that the hardware
	     handles this well, and we don't want to complicate the code
	     with shifting word contents around or handling extra bytes at
	     the start and/or end.  So we want the total transfer size and
	     alignment to fit with the element size.  */
	  if (length % potential_ew != 0
	      || !int_mode_for_size (bits, 0).exists (&elem_mode))
	    continue;

	  /* Find the mode to use for the copy inside the loop - or the
	     sole copy, if there is no loop.  */
	  if (!need_loop)
	    {
	      /* Try if we have an exact mode for the copy.  */
	      if (riscv_vector::get_vector_mode (elem_mode,
						 nunits).exists (&vmode))
		break;
	      /* Since we don't have a mode that exactly matches the transfer
		 size, we'll need to use pred_store, which is not available
		 for all vector modes, but only the E_RVVM* modes, hence
		 trying to find a vector mode for a merely rounded-up size is
		 pointless.  Still, by choosing a lower LMUL factor that
		 still allows an entire transfer, we can reduce register
		 pressure.  */
	      for (unsigned lmul = 1; lmul <= 4; lmul <<= 1)
		if (TARGET_MIN_VLEN * lmul <= nunits * BITS_PER_UNIT
		    /* Avoid losing the option of using vsetivli.  */
		    && (nunits <= 31 * lmul || nunits > 31 * 8)
		    && multiple_p (BYTES_PER_RISCV_VECTOR * lmul,
				   potential_ew)
		    && (riscv_vector::get_vector_mode
			 (elem_mode,
			  exact_div (BYTES_PER_RISCV_VECTOR * lmul,
				     potential_ew)).exists (&vmode)))
		  break;
	    }
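	  /* For example (illustrative): a 16-byte copy with 8-byte aligned
	     operands on rv64 starts with potential_ew == 8; 16 % 8 == 0,
	     so a vector mode with DImode elements and nunits == 2 is tried
	     first, and only if no suitable mode exists do we retry with
	     ew == 4, 2 and finally 1.  */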
	  /* The RVVM8?I modes are notionally 8 * BYTES_PER_RISCV_VECTOR
	     bytes wide.  BYTES_PER_RISCV_VECTOR can't be evenly divided by
	     the sizes of larger element types; the LMUL factor of 8 can at
	     the moment be divided by the SEW, with SEW of up to 8 bytes,
	     but there are reserved encodings so there might be larger SEW
	     in the future.  */
	  if (riscv_vector::get_vector_mode
	       (elem_mode,
		exact_div (BYTES_PER_RISCV_VECTOR * 8,
			   potential_ew)).exists (&vmode))
	    break;

	  /* We may get here if we tried an element size that's larger than
	     the hardware supports, but we should at least find a suitable
	     byte vector mode.  */
	  gcc_assert (potential_ew > 1);
	}
      if (potential_ew > 1)
	length_rtx = GEN_INT (length / potential_ew);
    }
  else
    {
      vmode = E_RVVM8QImode;
    }

  /* A memcpy libcall in the worst case takes 3 instructions to prepare the
     arguments + 1 for the call.  The RVV loop needs at least 7 instructions,
     so when optimizing for size a libcall may be preferable.  */
  if (size_p && need_loop)
    return false;

  /* length_rtx holds the (remaining) length of the required copy.
     cnt holds the length we copy with the current load/store pair.  */
  rtx cnt = length_rtx;
  rtx label = NULL_RTX;
  rtx dst_addr = copy_addr_to_reg (XEXP (dst_in, 0));
  rtx src_addr = copy_addr_to_reg (XEXP (src_in, 0));

  if (need_loop)
    {
      length_rtx = copy_to_mode_reg (Pmode, length_rtx);
      cnt = gen_reg_rtx (Pmode);
      label = gen_label_rtx ();

      emit_label (label);
      emit_insn (riscv_vector::gen_no_side_effects_vsetvl_rtx (vmode, cnt,
							       length_rtx));
    }

  vec = gen_reg_rtx (vmode);
  src = change_address (src_in, vmode, src_addr);
  dst = change_address (dst_in, vmode, dst_addr);

  /* If we don't need a loop and have a suitable mode to describe the size,
     just do a load / store pair and leave it up to the later lazy code
     motion pass to insert the appropriate vsetvli.  */
  if (!need_loop && known_eq (GET_MODE_SIZE (vmode), INTVAL (length_in)))
    {
      emit_move_insn (vec, src);
      emit_move_insn (dst, vec);
    }
  else
    {
      machine_mode mask_mode
	= riscv_vector::get_vector_mode (BImode,
					 GET_MODE_NUNITS (vmode)).require ();
      rtx mask = CONSTM1_RTX (mask_mode);
      if (!satisfies_constraint_K (cnt))
	cnt = force_reg (Pmode, cnt);
      rtx m_ops[] = {vec, mask, src};
      emit_nonvlmax_insn (code_for_pred_mov (vmode),
			  riscv_vector::UNARY_OP_TAMA, m_ops, cnt);
      emit_insn (gen_pred_store (vmode, dst, mask, vec, cnt,
				 get_avl_type_rtx (riscv_vector::NONVLMAX)));
    }

  if (need_loop)
    {
      emit_insn (gen_rtx_SET (src_addr, gen_rtx_PLUS (Pmode, src_addr, cnt)));
      emit_insn (gen_rtx_SET (dst_addr, gen_rtx_PLUS (Pmode, dst_addr, cnt)));
      emit_insn (gen_rtx_SET (length_rtx,
			      gen_rtx_MINUS (Pmode, length_rtx, cnt)));

      /* Emit the loop condition.  */
      rtx test = gen_rtx_NE (VOIDmode, length_rtx, const0_rtx);
      emit_jump_insn (gen_cbranch4 (Pmode, test, length_rtx, const0_rtx,
				    label));
      emit_insn (gen_nop ());
    }

  return true;
}

/* Implement rawmemchr and strlen using vector instructions.
   It can be assumed that the needle is in the haystack, otherwise the
   behavior is undefined.  */

void
expand_rawmemchr (machine_mode mode, rtx dst, rtx haystack, rtx needle,
		  bool strlen)
{
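  /* The first-fault loads (vle*ff.v) used below make it safe to read a
     whole vector even if the string ends just before a page boundary:
     if an element other than the first would fault, it is not accessed
     and vl is truncated instead, which is why the loop re-reads vl
     (csrr a1, vl in the sketch below) after every load.  */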
  /*
    rawmemchr:
    loop:
	vsetvli a1, zero, e[8,16,32,64], m1, ta, ma
	vle[8,16,32,64]ff.v v8, (a0)  # Load.
	csrr a1, vl                   # Get number of bytes read.
	vmseq.vx v0, v8, pat          # v0 = (v8 == {pat, pat, ...})
	vfirst.m a2, v0               # Find first hit.
	add a0, a0, a1                # Bump pointer.
	bltz a2, loop                 # Not found?

	sub a0, a0, a1                # Go back by a1.
	slli a2, a2, [0,1,2,3]        # Shift to get byte offset.
	add a0, a0, a2                # Add the offset.
	ret
  */
  gcc_assert (TARGET_VECTOR);

  if (strlen)
    gcc_assert (mode == E_QImode);

  unsigned int isize = GET_MODE_SIZE (mode).to_constant ();
  int lmul = TARGET_MAX_LMUL;
  poly_int64 nunits = exact_div (BYTES_PER_RISCV_VECTOR * lmul, isize);

  machine_mode vmode;
  if (!riscv_vector::get_vector_mode (GET_MODE_INNER (mode),
				      nunits).exists (&vmode))
    gcc_unreachable ();

  machine_mode mask_mode = riscv_vector::get_mask_mode (vmode);

  rtx cnt = gen_reg_rtx (Pmode);
  emit_move_insn (cnt, CONST0_RTX (Pmode));

  rtx end = gen_reg_rtx (Pmode);
  rtx vec = gen_reg_rtx (vmode);
  rtx mask = gen_reg_rtx (mask_mode);

  /* After finding the first vector element matching the needle, we need
     to multiply by the vector element width (SEW) in order to return a
     pointer to the matching byte.  */
  unsigned int shift = exact_log2 (GET_MODE_SIZE (mode).to_constant ());

  rtx src_addr = copy_addr_to_reg (XEXP (haystack, 0));
  rtx start_addr = copy_addr_to_reg (XEXP (haystack, 0));

  rtx loop = gen_label_rtx ();
  emit_label (loop);

  rtx vsrc = change_address (haystack, vmode, src_addr);

  /* Bump the pointer.  */
  rtx step = gen_reg_rtx (Pmode);
  emit_insn (gen_rtx_SET (step,
			  gen_rtx_ASHIFT (Pmode, cnt, GEN_INT (shift))));
  emit_insn (gen_rtx_SET (src_addr, gen_rtx_PLUS (Pmode, src_addr, step)));

  /* Emit a first-fault load.  */
  rtx vlops[] = {vec, vsrc};
  emit_vlmax_insn (code_for_pred_fault_load (vmode),
		   riscv_vector::UNARY_OP, vlops);

  /* Read how far we read.  */
  if (Pmode == SImode)
    emit_insn (gen_read_vlsi (cnt));
  else
    emit_insn (gen_read_vldi_zero_extend (cnt));

  /* Compare needle with haystack and store in a mask.  */
  rtx eq = gen_rtx_EQ (mask_mode, gen_const_vec_duplicate (vmode, needle),
		       vec);
  rtx vmsops[] = {mask, eq, vec, needle};
  emit_nonvlmax_insn (code_for_pred_eqne_scalar (vmode),
		      riscv_vector::COMPARE_OP, vmsops, cnt);

  /* Find the first bit in the mask.  */
  rtx vfops[] = {end, mask};
  emit_nonvlmax_insn (code_for_pred_ffs (mask_mode, Pmode),
		      riscv_vector::CPOP_OP, vfops, cnt);

  /* Emit the loop condition.  */
  rtx test = gen_rtx_LT (VOIDmode, end, const0_rtx);
  emit_jump_insn (gen_cbranch4 (Pmode, test, end, const0_rtx, loop));

  if (strlen)
    {
      /* For strlen, return the length.  */
      emit_insn (gen_rtx_SET (dst, gen_rtx_PLUS (Pmode, src_addr, end)));
      emit_insn (gen_rtx_SET (dst, gen_rtx_MINUS (Pmode, dst, start_addr)));
    }
  else
    {
      /* For rawmemchr, return the position at SRC + END * [1,2,4,8].  */
      emit_insn (gen_rtx_SET (end, gen_rtx_ASHIFT (Pmode, end,
						   GEN_INT (shift))));
      emit_insn (gen_rtx_SET (dst, gen_rtx_PLUS (Pmode, src_addr, end)));
    }
}

/* Implement cmpstr using vector instructions.  The ALIGNMENT and
   NCOMPARE parameters are unused for now.  */

bool
expand_strcmp (rtx result, rtx src1, rtx src2, rtx nbytes,
	       unsigned HOST_WIDE_INT, bool)
{
  gcc_assert (TARGET_VECTOR);

  /* We don't support big endian.  */
  if (BYTES_BIG_ENDIAN)
    return false;

  bool with_length = nbytes != NULL_RTX;

  if (with_length
      && (!REG_P (nbytes) && !SUBREG_P (nbytes) && !CONST_INT_P (nbytes)))
    return false;

  if (with_length && CONST_INT_P (nbytes))
    nbytes = force_reg (Pmode, nbytes);

  machine_mode mode = E_QImode;
  unsigned int isize = GET_MODE_SIZE (mode).to_constant ();
  int lmul = TARGET_MAX_LMUL;
  poly_int64 nunits = exact_div (BYTES_PER_RISCV_VECTOR * lmul, isize);

  machine_mode vmode;
  if (!riscv_vector::get_vector_mode (GET_MODE_INNER (mode), nunits)
	 .exists (&vmode))
    gcc_unreachable ();

  machine_mode mask_mode = riscv_vector::get_mask_mode (vmode);
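  /* The comparison loop emitted below looks roughly like this in the
     no-length case (registers illustrative; the pointer bumps happen
     at the loop head in the emitted code):

     loop:
	vle8ff.v  v8, (a0)    # Load chunk of string1 (first-fault).
	vle8ff.v  v16, (a1)   # Load chunk of string2 (first-fault).
	csrr      a2, vl      # Number of elements actually read.
	vmseq.vi  v0, v8, 0   # NUL byte in string1?
	vmsne.vv  v1, v8, v16 # Any difference?
	vmor.mm   v0, v0, v1  # Combine both masks.
	vfirst.m  a3, v0      # Index of first hit, or -1.
	add       a0, a0, a2  # Bump pointers.
	add       a1, a1, a2
	bltz      a3, loop    # Nothing found yet?
  */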
  /* Prepare addresses.  */
  rtx src_addr1 = copy_addr_to_reg (XEXP (src1, 0));
  rtx vsrc1 = change_address (src1, vmode, src_addr1);
  rtx src_addr2 = copy_addr_to_reg (XEXP (src2, 0));
  rtx vsrc2 = change_address (src2, vmode, src_addr2);

  /* Set initial pointer bump to 0.  */
  rtx cnt = gen_reg_rtx (Pmode);
  emit_move_insn (cnt, CONST0_RTX (Pmode));

  rtx sub = gen_reg_rtx (Pmode);
  emit_move_insn (sub, CONST0_RTX (Pmode));

  /* Create source vectors.  */
  rtx vec1 = gen_reg_rtx (vmode);
  rtx vec2 = gen_reg_rtx (vmode);

  rtx done = gen_label_rtx ();
  rtx loop = gen_label_rtx ();
  emit_label (loop);

  /* Bump the pointers.  */
  emit_insn (gen_rtx_SET (src_addr1, gen_rtx_PLUS (Pmode, src_addr1, cnt)));
  emit_insn (gen_rtx_SET (src_addr2, gen_rtx_PLUS (Pmode, src_addr2, cnt)));

  rtx vlops1[] = {vec1, vsrc1};
  rtx vlops2[] = {vec2, vsrc2};

  if (!with_length)
    {
      emit_vlmax_insn (code_for_pred_fault_load (vmode),
		       riscv_vector::UNARY_OP, vlops1);
      emit_vlmax_insn (code_for_pred_fault_load (vmode),
		       riscv_vector::UNARY_OP, vlops2);
    }
  else
    {
      nbytes = gen_lowpart (Pmode, nbytes);
      emit_nonvlmax_insn (code_for_pred_fault_load (vmode),
			  riscv_vector::UNARY_OP, vlops1, nbytes);
      emit_nonvlmax_insn (code_for_pred_fault_load (vmode),
			  riscv_vector::UNARY_OP, vlops2, nbytes);
    }

  /* Read the vl for the next pointer bump.  */
  if (Pmode == SImode)
    emit_insn (gen_read_vlsi (cnt));
  else
    emit_insn (gen_read_vldi_zero_extend (cnt));

  if (with_length)
    {
      rtx test_done = gen_rtx_EQ (VOIDmode, cnt, const0_rtx);
      emit_jump_insn (gen_cbranch4 (Pmode, test_done, cnt, const0_rtx, done));
      emit_insn (gen_rtx_SET (nbytes, gen_rtx_MINUS (Pmode, nbytes, cnt)));
    }

  /* Look for a \0 in the first string.  */
  rtx mask0 = gen_reg_rtx (mask_mode);
  rtx eq0
    = gen_rtx_EQ (mask_mode,
		  gen_const_vec_duplicate (vmode, CONST0_RTX (mode)), vec1);
  rtx vmsops1[] = {mask0, eq0, vec1, CONST0_RTX (mode)};
  emit_nonvlmax_insn (code_for_pred_eqne_scalar (vmode),
		      riscv_vector::COMPARE_OP, vmsops1, cnt);

  /* Look for vec1 != vec2 (includes vec2[i] == 0).  */
  rtx maskne = gen_reg_rtx (mask_mode);
  rtx ne = gen_rtx_NE (mask_mode, vec1, vec2);
  rtx vmsops[] = {maskne, ne, vec1, vec2};
  emit_nonvlmax_insn (code_for_pred_cmp (vmode),
		      riscv_vector::COMPARE_OP, vmsops, cnt);

  /* Combine both masks into one.  */
  rtx mask = gen_reg_rtx (mask_mode);
  rtx vmorops[] = {mask, mask0, maskne};
  emit_nonvlmax_insn (code_for_pred (IOR, mask_mode),
		      riscv_vector::BINARY_MASK_OP, vmorops, cnt);

  /* Find the first bit in the mask (the first unequal element).  */
  rtx found_at = gen_reg_rtx (Pmode);
  rtx vfops[] = {found_at, mask};
  emit_nonvlmax_insn (code_for_pred_ffs (mask_mode, Pmode),
		      riscv_vector::CPOP_OP, vfops, cnt);

  /* Emit the loop condition.  */
  rtx test = gen_rtx_LT (VOIDmode, found_at, const0_rtx);
  emit_jump_insn (gen_cbranch4 (Pmode, test, found_at, const0_rtx, loop));

  /* Walk up to the difference point.  */
  emit_insn (gen_rtx_SET (src_addr1,
			  gen_rtx_PLUS (Pmode, src_addr1, found_at)));
  emit_insn (gen_rtx_SET (src_addr2,
			  gen_rtx_PLUS (Pmode, src_addr2, found_at)));

  /* Load the respective byte and compute the difference.  */
  rtx c1 = gen_reg_rtx (Pmode);
  rtx c2 = gen_reg_rtx (Pmode);

  do_load_from_addr (mode, c1, src_addr1, src1);
  do_load_from_addr (mode, c2, src_addr2, src2);

  do_sub3 (sub, c1, c2);

  if (with_length)
    emit_label (done);

  emit_insn (gen_movsi (result, gen_lowpart (SImode, sub)));
  return true;
}

}