酱坛子

专注C++技术在这里写下自己的学习心得感悟和大家讨论共同进步（欢迎批评！！！）

C++博客 :: 首页 :: 联系 :: 聚合

:: 管理

66 Posts :: 16 Stories :: 236 Comments :: 0 Trackbacks

公告

王一伟湖南商学院毕业电子信息工程专业

常用链接

留言簿(19)

我参与的团队

随笔档案(65)

文章分类(16)

文章档案(16)

相册

Blog List

技术网站

Boost中文站

我的Blog

我的生活blog

搜索

积分与排名

积分 - 388236
排名 - 64

阅读排行榜

评论排行榜

amd的Memcpy函数

void * memcpy_amd(void *dest, const void *src, size_t n)
{
__asm {

mov ecx, [n] ; number of bytes to copy
mov edi, [dest] ; destination
mov esi, [src] ; source
mov ebx, ecx ; keep a copy of count

cld
cmp ecx, TINY_BLOCK_COPY
jb $memcpy_ic_3 ; tiny? skip mmx copy

cmp ecx, 32*1024 ; don't align between 32k-64k because
jbe $memcpy_do_align ; it appears to be slower
cmp ecx, 64*1024
jbe $memcpy_align_done
$memcpy_do_align:
mov ecx, 8 ; a trick that's faster than rep movsb...
sub ecx, edi ; align destination to qword
and ecx, 111b ; get the low bits
sub ebx, ecx ; update copy count
neg ecx ; set up to jump into the array
add ecx, offset $memcpy_align_done
jmp ecx ; jump to array of movsb's

align 4
movsb
movsb
movsb
movsb
movsb
movsb
movsb
movsb

$memcpy_align_done: ; destination is dword aligned
mov ecx, ebx ; number of bytes left to copy
shr ecx, 6 ; get 64-byte block count
jz $memcpy_ic_2 ; finish the last few bytes

cmp ecx, IN_CACHE_COPY/64 ; too big 4 cache? use uncached copy
jae $memcpy_uc_test

// This is small block copy that uses the MMX registers to copy 8 bytes
// at a time. It uses the "unrolled loop" optimization, and also uses
// the software prefetch instruction to get the data into the cache.
align 16
$memcpy_ic_1: ; 64-byte block copies, in-cache copy

prefetchnta [esi + (200*64/34+192)] ; start reading ahead

movq mm0, [esi+0] ; read 64 bits
movq mm1, [esi+8]
movq [edi+0], mm0 ; write 64 bits
movq [edi+8], mm1 ;    note:  the normal movq writes the
movq mm2, [esi+16] ;    data to cache; a cache line will be
movq mm3, [esi+24] ;    allocated as needed, to store the data
movq [edi+16], mm2
movq [edi+24], mm3
movq mm0, [esi+32]
movq mm1, [esi+40]
movq [edi+32], mm0
movq [edi+40], mm1
movq mm2, [esi+48]
movq mm3, [esi+56]
movq [edi+48], mm2
movq [edi+56], mm3

add esi, 64 ; update source pointer
add edi, 64 ; update destination pointer
dec ecx ; count down
jnz $memcpy_ic_1 ; last 64-byte block?

$memcpy_ic_2:
mov ecx, ebx ; has valid low 6 bits of the byte count
$memcpy_ic_3:
shr ecx, 2 ; dword count
and ecx, 1111b ; only look at the "remainder" bits
neg ecx ; set up to jump into the array
add ecx, offset $memcpy_last_few
jmp ecx ; jump to array of movsd's

$memcpy_uc_test:
cmp ecx, UNCACHED_COPY/64 ; big enough? use block prefetch copy
jae $memcpy_bp_1

$memcpy_64_test:
or ecx, ecx ; tail end of block prefetch will jump here
jz $memcpy_ic_2 ; no more 64-byte blocks left

// For larger blocks, which will spill beyond the cache, it's faster to
// use the Streaming Store instruction MOVNTQ. This write instruction
// bypasses the cache and writes straight to main memory. This code also
// uses the software prefetch instruction to pre-read the data.
align 16
$memcpy_uc_1: ; 64-byte blocks, uncached copy

prefetchnta [esi + (200*64/34+192)] ; start reading ahead

movq mm0,[esi+0] ; read 64 bits
add edi,64 ; update destination pointer
movq mm1,[esi+8]
add esi,64 ; update source pointer
movq mm2,[esi-48]
movntq [edi-64], mm0 ; write 64 bits, bypassing the cache
movq mm0,[esi-40] ;    note: movntq also prevents the CPU
movntq [edi-56], mm1 ;    from READING the destination address
movq mm1,[esi-32] ;    into the cache, only to be over-written
movntq [edi-48], mm2 ;    so that also helps performance
movq mm2,[esi-24]
movntq [edi-40], mm0
movq mm0,[esi-16]
movntq [edi-32], mm1
movq mm1,[esi-8]
movntq [edi-24], mm2
movntq [edi-16], mm0
dec ecx
movntq [edi-8], mm1
jnz $memcpy_uc_1 ; last 64-byte block?

jmp $memcpy_ic_2 ; almost done

// For the largest size blocks, a special technique called Block Prefetch
// can be used to accelerate the read operations. Block Prefetch reads
// one address per cache line, for a series of cache lines, in a short loop.
// This is faster than using software prefetch, in this case.
// The technique is great for getting maximum read bandwidth,
// especially in DDR memory systems.
$memcpy_bp_1: ; large blocks, block prefetch copy

cmp ecx, CACHEBLOCK ; big enough to run another prefetch loop?
jl $memcpy_64_test ; no, back to regular uncached copy

mov eax, CACHEBLOCK / 2 ; block prefetch loop, unrolled 2X
add esi, CACHEBLOCK * 64 ; move to the top of the block
align 16
$memcpy_bp_2:
mov edx, [esi-64] ; grab one address per cache line
mov edx, [esi-128] ; grab one address per cache line
sub esi, 128 ; go reverse order
dec eax ; count down the cache lines
jnz $memcpy_bp_2 ; keep grabbing more lines into cache

mov eax, CACHEBLOCK ; now that it's in cache, do the copy
align 16
$memcpy_bp_3:
movq mm0, [esi   ] ; read 64 bits
movq mm1, [esi+ 8]
movq mm2, [esi+16]
movq mm3, [esi+24]
movq mm4, [esi+32]
movq mm5, [esi+40]
movq mm6, [esi+48]
movq mm7, [esi+56]
add esi, 64 ; update source pointer
movntq [edi   ], mm0 ; write 64 bits, bypassing cache
movntq [edi+ 8], mm1 ;    note: movntq also prevents the CPU
movntq [edi+16], mm2 ;    from READING the destination address
movntq [edi+24], mm3 ;    into the cache, only to be over-written,
movntq [edi+32], mm4 ;    so that also helps performance
movntq [edi+40], mm5
movntq [edi+48], mm6
movntq [edi+56], mm7
add edi, 64 ; update dest pointer

dec eax ; count down

jnz $memcpy_bp_3 ; keep copying
sub ecx, CACHEBLOCK ; update the 64-byte block count
jmp $memcpy_bp_1 ; keep processing chunks

// The smallest copy uses the X86 "movsd" instruction, in an optimized
// form which is an "unrolled loop". Then it handles the last few bytes.
align 4
movsd
movsd ; perform last 1-15 dword copies
movsd
movsd
movsd
movsd
movsd
movsd
movsd
movsd ; perform last 1-7 dword copies
movsd
movsd
movsd
movsd
movsd
movsd

$memcpy_last_few: ; dword aligned from before movsd's
mov ecx, ebx ; has valid low 2 bits of the byte count
and ecx, 11b ; the last few cows must come home
jz $memcpy_final ; no more, let's leave
rep movsb ; the last 1, 2, or 3 bytes

$memcpy_final:
emms ; clean up the MMX state
sfence ; flush the write buffer
mov eax, [dest] ; ret value = destination pointer

}
}

posted on 2007-01-02 14:12 @王一伟阅读(1357) 评论(2) 编辑收藏引用

Feedback

# re: amd的Memcpy函数 2007-12-29 16:44 Solomon Joh

弓虽！全是asm啊，头大中…… 回复更多评论

# re: amd的Memcpy函数 2007-12-30 08:04 @王一伟

呵呵粘帖过来的还没来得及研究呢
有时间研究下可以探讨探讨

指令一般般多拉哈哈回复更多评论

刷新评论列表

只有注册用户登录后才能发表评论。
【推荐】100%开源！大型工业跨平台软件C++源码提供，建模，组态！



网站导航: 博客园 IT新闻 BlogJava 博问 Chat2DB 管理

酱坛子

公告