/* Copyright 2002 Andi Kleen, SuSE Labs */

#include <linux/linkage.h>
#include <asm/cpufeatures.h>
#include <asm/alternative-asm.h>

.weak memset

/*
 * ISO C memset - set a memory block to a byte value. This function uses
 * fast string operations to get better performance than the original
 * function. The code is simpler and shorter than the original function
 * as well.
 *
 * rdi   destination
 * rsi   value (char)
 * rdx   count (bytes)
 *
 * rax   original destination
 */
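/*
 * For reference, this implements the standard C prototype, with the
 * arguments arriving in the usual System V AMD64 argument registers:
 *
 *	void *memset(void *s, int c, size_t n);
 */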
ENTRY(memset)
ENTRY(__memset)
/*
 * Some CPUs support the enhanced REP MOVSB/STOSB (ERMS) feature; it is
 * recommended to use it when available. Failing that, use the fast
 * string instructions (REP STOSQ) if the CPU handles them well.
 * Otherwise, fall back to the original memset function.
 */
	ALTERNATIVE_2 "jmp memset_orig", "", X86_FEATURE_REP_GOOD, \
		      "jmp memset_erms", X86_FEATURE_ERMS

	movq %rdi,%r9
	movq %rdx,%rcx
	andl $7,%edx
	shrq $3,%rcx
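	/* rcx = number of whole qwords to store, edx = 0..7 trailing bytes */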
	/* expand byte value  */
	movzbl %sil,%esi
	movabs $0x0101010101010101,%rax
	imulq %rsi,%rax
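	/* e.g. value 0xab: 0xab * 0x0101010101010101 = 0xabababababababab */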
	rep stosq
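	/* store the 0..7 trailing bytes left over from the qword stores */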
	movl %edx,%ecx
	rep stosb
	movq %r9,%rax
	ret
ENDPROC(memset)
ENDPROC(__memset)

/*
 * ISO C memset - set a memory block to a byte value. This function uses
 * enhanced REP STOSB to override the fast string function.
 * The code is simpler and shorter than the fast string function as well.
 *
 * rdi   destination
 * rsi   value (char)
 * rdx   count (bytes)
 *
 * rax   original destination
 */
ENTRY(memset_erms)
	movq %rdi,%r9
	movb %sil,%al
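	/* no byte expansion needed: rep stosb replicates %al by itself */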
	movq %rdx,%rcx
	rep stosb
	movq %r9,%rax
	ret
ENDPROC(memset_erms)

ENTRY(memset_orig)
	movq %rdi,%r10

	/* expand byte value  */
	movzbl %sil,%ecx
	movabs $0x0101010101010101,%rax
	imulq  %rcx,%rax

	/* align dst */
	movl  %edi,%r9d
	andl  $7,%r9d
	jnz  .Lbad_alignment
.Lafter_bad_alignment:

	movq  %rdx,%rcx
	shrq  $6,%rcx
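	/* rcx = number of whole 64-byte blocks */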
	jz	.Lhandle_tail

	.p2align 4
.Lloop_64:
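	/* store 64 bytes (eight qwords) per iteration */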
	decq  %rcx
	movq  %rax,(%rdi)
	movq  %rax,8(%rdi)
	movq  %rax,16(%rdi)
	movq  %rax,24(%rdi)
	movq  %rax,32(%rdi)
	movq  %rax,40(%rdi)
	movq  %rax,48(%rdi)
	movq  %rax,56(%rdi)
	leaq  64(%rdi),%rdi
	jnz    .Lloop_64

	/*
	 * Handle the tail in loops. The loops should be faster than
	 * hard-to-predict jump tables.
	 */
	.p2align 4
.Lhandle_tail:
	movl	%edx,%ecx
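	/* 63 & ~7 == 56: keep the whole qwords left over from the 64-byte loop */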
	andl    $63&(~7),%ecx
	jz	.Lhandle_7
	shrl	$3,%ecx
	.p2align 4
.Lloop_8:
	decl   %ecx
	movq  %rax,(%rdi)
	leaq  8(%rdi),%rdi
	jnz    .Lloop_8

.Lhandle_7:
	andl	$7,%edx
	jz	.Lende
	.p2align 4
.Lloop_1:
	decl    %edx
	movb	%al,(%rdi)
	leaq	1(%rdi),%rdi
	jnz     .Lloop_1

.Lende:
	movq	%r10,%rax
	ret

.Lbad_alignment:
	cmpq $7,%rdx
	jbe	.Lhandle_7
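	/*
	 * Store 8 bytes unaligned to cover the misaligned head, then advance
	 * dst to the next 8-byte boundary and shrink the count accordingly
	 * (%r9 still holds dst & 7 from the alignment check above).
	 */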
	movq %rax,(%rdi)	/* unaligned store */
	movq $8,%r8
	subq %r9,%r8
	addq %r8,%rdi
	subq %r8,%rdx
	jmp .Lafter_bad_alignment
.Lfinal:
ENDPROC(memset_orig)