#include "../binary_c.h"

#if defined USE_SKYWIND_FAST_MEMCPY
#ifndef __HAVE_AVX__

//=====================================================================
//
// FastMemcpy.c - skywind3000@163.com, 2015
//
// Feature:
// about 50% faster on average than the standard memcpy (tested with VC2012 and GCC 4.9)
//
//=====================================================================

/*
The MIT License (MIT)

Copyright (c) 2015 Linwei

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
*/

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>

#if (defined(_WIN32) || defined(WIN32))
#include <windows.h>
#include <mmsystem.h>
#ifdef _MSC_VER
#pragma comment(lib, "winmm.lib")
#endif
#elif defined(__unix)
#include <sys/time.h>
#include <unistd.h>
#else
#error it can only be compiled under windows or unix
#endif



//---------------------------------------------------------------------
// main routine
//---------------------------------------------------------------------
// Copies `size` bytes from `source` to `destination` using 128-bit SSE
// intrinsics.
//
// Strategy, by size:
//   - size <= 128: delegated entirely to memcpy_tiny().
//   - otherwise the destination pointer is first advanced to a 16-byte
//     boundary, then whole 128-byte chunks are moved through eight 128-bit
//     registers:
//       * remaining size <= cachesize: unaligned loads + aligned stores;
//       * larger: streaming stores (_mm_stream_si128), followed by a
//         store fence.
//   - the remainder (fewer than 128 bytes) is finished by memcpy_tiny().
//
// Returns `destination` on the large path; on the small path it returns
// whatever memcpy_tiny() returns (presumably the destination pointer --
// memcpy_tiny is defined outside this block, confirm).
//
// NOTE(review): no overlap handling is visible here; presumably the same
// non-overlapping contract as memcpy() applies -- confirm with callers.
void* memcpy_fast(void *destination, const void *source, size_t size)
{
    unsigned char *dst = (unsigned char*)destination;
	const unsigned char *src = (const unsigned char*)source;
	// Threshold (2 MiB) between the regular-store path and the
	// streaming-store path; labelled as the L2-cache size by the author.
	static size_t cachesize = 0x200000; // L2-cache size
	size_t padding;

	// small memory copy: everything up to 128 bytes goes to memcpy_tiny()
	if (size <= 128) {
		return memcpy_tiny(dst, src, size);
	}

	// align destination to 16 bytes boundary:
	// padding = number of bytes (0..15) up to the next 16-byte boundary of dst
	padding = (16 - (((size_t)dst) & 15)) & 15;

	if (padding > 0) {
		// Copy a full 16 bytes from the unaligned start, but advance by only
		// `padding` bytes. The bytes between `padding` and 16 are written
		// again by the code below; this stays in bounds because size > 128.
		__m128i head = _mm_loadu_si128((const __m128i*)src);
		_mm_storeu_si128((__m128i*)dst, head);
		dst += padding;
		src += padding;
		size -= padding;
	}

	// medium size copy: unaligned loads (src alignment unknown),
	// aligned stores (dst is 16-byte aligned from here on)
	if (size <= cachesize) {
		__m128i c0, c1, c2, c3, c4, c5, c6, c7;

		// 128 bytes per iteration: load all eight registers, then store them
		for (; size >= 128; size -= 128) {
			c0 = _mm_loadu_si128(((const __m128i*)src) + 0);
			c1 = _mm_loadu_si128(((const __m128i*)src) + 1);
			c2 = _mm_loadu_si128(((const __m128i*)src) + 2);
			c3 = _mm_loadu_si128(((const __m128i*)src) + 3);
			c4 = _mm_loadu_si128(((const __m128i*)src) + 4);
			c5 = _mm_loadu_si128(((const __m128i*)src) + 5);
			c6 = _mm_loadu_si128(((const __m128i*)src) + 6);
			c7 = _mm_loadu_si128(((const __m128i*)src) + 7);
			// prefetch hint for source data 256 bytes past the current chunk
			_mm_prefetch((const char*)(src + 256), _MM_HINT_NTA);
			src += 128;
			_mm_store_si128((((__m128i*)dst) + 0), c0);
			_mm_store_si128((((__m128i*)dst) + 1), c1);
			_mm_store_si128((((__m128i*)dst) + 2), c2);
			_mm_store_si128((((__m128i*)dst) + 3), c3);
			_mm_store_si128((((__m128i*)dst) + 4), c4);
			_mm_store_si128((((__m128i*)dst) + 5), c5);
			_mm_store_si128((((__m128i*)dst) + 6), c6);
			_mm_store_si128((((__m128i*)dst) + 7), c7);
			dst += 128;
		}
	}
	else {		// big memory copy: streaming stores instead of regular stores
		__m128i c0, c1, c2, c3, c4, c5, c6, c7;

		// prefetch hint for the first source bytes before entering the loop
		_mm_prefetch((const char*)(src), _MM_HINT_NTA);

		if ((((size_t)src) & 15) == 0) {	// source aligned: aligned loads
			for (; size >= 128; size -= 128) {
				c0 = _mm_load_si128(((const __m128i*)src) + 0);
				c1 = _mm_load_si128(((const __m128i*)src) + 1);
				c2 = _mm_load_si128(((const __m128i*)src) + 2);
				c3 = _mm_load_si128(((const __m128i*)src) + 3);
				c4 = _mm_load_si128(((const __m128i*)src) + 4);
				c5 = _mm_load_si128(((const __m128i*)src) + 5);
				c6 = _mm_load_si128(((const __m128i*)src) + 6);
				c7 = _mm_load_si128(((const __m128i*)src) + 7);
				_mm_prefetch((const char*)(src + 256), _MM_HINT_NTA);
				src += 128;
				_mm_stream_si128((((__m128i*)dst) + 0), c0);
				_mm_stream_si128((((__m128i*)dst) + 1), c1);
				_mm_stream_si128((((__m128i*)dst) + 2), c2);
				_mm_stream_si128((((__m128i*)dst) + 3), c3);
				_mm_stream_si128((((__m128i*)dst) + 4), c4);
				_mm_stream_si128((((__m128i*)dst) + 5), c5);
				_mm_stream_si128((((__m128i*)dst) + 6), c6);
				_mm_stream_si128((((__m128i*)dst) + 7), c7);
				dst += 128;
			}
		}
		else {							// source unaligned: unaligned loads
			for (; size >= 128; size -= 128) {
				c0 = _mm_loadu_si128(((const __m128i*)src) + 0);
				c1 = _mm_loadu_si128(((const __m128i*)src) + 1);
				c2 = _mm_loadu_si128(((const __m128i*)src) + 2);
				c3 = _mm_loadu_si128(((const __m128i*)src) + 3);
				c4 = _mm_loadu_si128(((const __m128i*)src) + 4);
				c5 = _mm_loadu_si128(((const __m128i*)src) + 5);
				c6 = _mm_loadu_si128(((const __m128i*)src) + 6);
				c7 = _mm_loadu_si128(((const __m128i*)src) + 7);
				_mm_prefetch((const char*)(src + 256), _MM_HINT_NTA);
				src += 128;
				_mm_stream_si128((((__m128i*)dst) + 0), c0);
				_mm_stream_si128((((__m128i*)dst) + 1), c1);
				_mm_stream_si128((((__m128i*)dst) + 2), c2);
				_mm_stream_si128((((__m128i*)dst) + 3), c3);
				_mm_stream_si128((((__m128i*)dst) + 4), c4);
				_mm_stream_si128((((__m128i*)dst) + 5), c5);
				_mm_stream_si128((((__m128i*)dst) + 6), c6);
				_mm_stream_si128((((__m128i*)dst) + 7), c7);
				dst += 128;
			}
		}
		// store fence issued once, after all streaming stores of this path
		_mm_sfence();
	}

	// tail: fewer than 128 bytes remain at this point
	memcpy_tiny(dst, src, size);

	return destination;
}




#ifdef __MEMCPY_TESTING_CODE

#include "FastMemcpy.h"
// Returns a millisecond tick count truncated to unsigned int.
//
// Only differences between two readings are meaningful: the value wraps
// modulo 2^32 milliseconds, and unsigned subtraction of two readings still
// yields the elapsed time across a wrap.
// On Windows this is timeGetTime(); elsewhere it is derived from
// gettimeofday(). If gettimeofday() fails, 0 is returned.
unsigned int gettime(void)
{
	#if (defined(_WIN32) || defined(WIN32))
	return timeGetTime();
	#else
	struct timeval now = { 0, 0 };
	// POSIX leaves the behaviour unspecified for a non-null timezone
	// argument, so pass NULL.
	if (gettimeofday(&now, NULL) != 0) {
		return 0;
	}
	// Convert to unsigned before multiplying: the wrap-around is then well
	// defined, whereas tv_sec * 1000 in a signed 32-bit time_t would overflow.
	return (unsigned int)now.tv_sec * 1000u + (unsigned int)(now.tv_usec / 1000);
	#endif
}

// Suspends the calling thread for roughly `millisec` milliseconds.
//
// On Windows this is Sleep(). Elsewhere the interval is split into whole
// seconds (sleep) and a sub-second remainder (usleep), because usleep() may
// reject intervals of one million microseconds or more, and because
// millisec * 1000 would wrap for large arguments.
void sleepms(unsigned int millisec)
{
#if defined(_WIN32) || defined(WIN32)
	Sleep(millisec);
#else
	const unsigned int whole_seconds = millisec / 1000;
	const unsigned int remainder_ms = millisec % 1000;

	if (whole_seconds > 0) {
		sleep(whole_seconds);
	}
	// remainder_ms < 1000, so the argument is always below 1,000,000 us
	usleep(remainder_ms * 1000);
#endif
}


// Times `times` copies of `size` bytes with memcpy() and with memcpy_fast()
// and prints both durations in milliseconds.
//
//   dstalign  non-zero: destination starts on a 64-byte boundary;
//             zero: destination starts 1 byte past such a boundary.
//   srcalign  non-zero: source starts on a 64-byte boundary;
//             zero: source starts 3 bytes past such a boundary.
//   size      number of bytes per copy.
//   times     number of copies per timed loop (<= 0 runs no copies).
//
// If the buffers cannot be allocated, a message is written to stderr and
// nothing is measured.
void benchmark(int dstalign, int srcalign, size_t size, int times)
{
	// Worst case: 63 bytes to reach the next 64-byte boundary plus the
	// 3-byte deliberate source misalignment.
	const size_t slack = 63 + 3;
	char *DATA1;
	char *DATA2;
	size_t LINEAR1, LINEAR2;
	char *ALIGN1, *ALIGN2;
	char *dst, *src;
	unsigned int t1, t2;
	int k;

	// size + slack must not wrap around
	if (size > (size_t)-1 - slack) {
		fprintf(stderr, "benchmark: size %lu is too large\n", (unsigned long)size);
		return;
	}

	DATA1 = (char*)malloc(size + slack);
	DATA2 = (char*)malloc(size + slack);
	if (DATA1 == NULL || DATA2 == NULL) {
		fprintf(stderr, "benchmark: cannot allocate two buffers of %lu bytes\n",
			(unsigned long)(size + slack));
		free(DATA1);
		free(DATA2);
		return;
	}

	// Initialise both buffers so that the copies read defined data and both
	// buffers have already been written once before the first timed loop.
	memset(DATA1, 0, size + slack);
	memset(DATA2, 0, size + slack);

	// round each buffer start up to the next 64-byte boundary
	LINEAR1 = ((size_t)DATA1);
	LINEAR2 = ((size_t)DATA2);
	ALIGN1 = (char*)(((64 - (LINEAR1 & 63)) & 63) + LINEAR1);
	ALIGN2 = (char*)(((64 - (LINEAR2 & 63)) & 63) + LINEAR2);
	dst = (dstalign)? ALIGN1 : (ALIGN1 + 1);
	src = (srcalign)? ALIGN2 : (ALIGN2 + 3);

	// reference: standard memcpy
	sleepms(100);
	t1 = gettime();
	for (k = times; k > 0; k--) {
		memcpy(dst, src, size);
	}
	t1 = gettime() - t1;

	// candidate: memcpy_fast
	sleepms(100);
	t2 = gettime();
	for (k = times; k > 0; k--) {
		memcpy_fast(dst, src, size);
	}
	t2 = gettime() - t2;

	free(DATA1);
	free(DATA2);

	printf("result(dst %s, src %s): memcpy_fast=%dms memcpy=%d ms\n",  
		dstalign? "aligned" : "unalign", 
		srcalign? "aligned" : "unalign", (int)t2, (int)t1);
}


// Runs benchmark() for one copy size over all four combinations of
// destination / source alignment, in the order
// (aligned, aligned), (aligned, unaligned), (unaligned, aligned),
// (unaligned, unaligned), framed by a header line and a blank line.
void bench(int copysize, int times)
{
	int dst_aligned;
	int src_aligned;

	printf("benchmark(size=%d bytes, times=%d):\n", copysize, times);

	// count down from 1 to 0 so the aligned case comes first on both axes
	for (dst_aligned = 1; dst_aligned >= 0; dst_aligned--) {
		for (src_aligned = 1; src_aligned >= 0; src_aligned--) {
			benchmark(dst_aligned, src_aligned, copysize, times);
		}
	}

	printf("\n");
}


// Times `times` copies of random size (1..maxsize bytes) between random
// offsets of two static buffers, once with memcpy() and once with
// memcpy_fast(), and prints both durations in milliseconds.
//
//   maxsize  largest copy size in bytes; must be at least 1 and small enough
//            that the largest offset plus maxsize stays inside the buffers.
//   times    number of copies per timed loop (<= 0 runs no copies).
//
// An out-of-range maxsize is reported on stderr and nothing is measured.
void random_bench(int maxsize, int times)
{
	static char A[11 * 1024 * 1024 + 2];
	static char B[11 * 1024 * 1024 + 2];
	static int random_offsets[0x10000];
	static int random_sizes[0x8000];
	// largest offset generated below; offset + size must stay inside A and B
	const int max_offset = 10 * 1024 * 1024;
	const int size_limit = (int)sizeof(A) - max_offset;
	// a negative count would otherwise convert to a huge unsigned bound
	const unsigned int iterations = (times > 0) ? (unsigned int)times : 0u;
	unsigned int i, p1, p2;
	unsigned int t1, t2;

	// maxsize == 0 would divide by zero below; too large would overrun A/B
	if (maxsize < 1 || maxsize > size_limit) {
		fprintf(stderr, "random_bench: maxsize must be between 1 and %d\n", size_limit);
		return;
	}

	for (i = 0; i < 0x10000; i++) {	// generate random offsets
		random_offsets[i] = rand() % (max_offset + 1);
	}
	for (i = 0; i < 0x8000; i++) {	// generate random sizes
		random_sizes[i] = 1 + rand() % maxsize;
	}

	// reference: standard memcpy (two offsets and one size consumed per copy)
	sleepms(100);
	t1 = gettime();
	for (p1 = 0, p2 = 0, i = 0; i < iterations; i++) {
		int offset1 = random_offsets[(p1++) & 0xffff];
		int offset2 = random_offsets[(p1++) & 0xffff];
		int size = random_sizes[(p2++) & 0x7fff];
		memcpy(A + offset1, B + offset2, size);
	}
	t1 = gettime() - t1;

	// candidate: memcpy_fast over the same offset / size sequence
	sleepms(100);
	t2 = gettime();
	for (p1 = 0, p2 = 0, i = 0; i < iterations; i++) {
		int offset1 = random_offsets[(p1++) & 0xffff];
		int offset2 = random_offsets[(p1++) & 0xffff];
		int size = random_sizes[(p2++) & 0x7fff];
		memcpy_fast(A + offset1, B + offset2, size);
	}
	t2 = gettime() - t2;

	printf("benchmark random access:\n");
	printf("memcpy_fast=%dms memcpy=%dms\n\n", (int)t2, (int)t1);
}


#ifdef _MSC_VER
#pragma comment(lib, "winmm.lib")
#endif

// Benchmark driver: runs the fixed-size benchmarks from 32 bytes up to 8 MiB
// (iteration counts shrink as the copy size grows), then the random
// size / offset benchmark.
int main(void)
{
	static const struct {
		int copysize;
		int times;
	} runs[] = {
		{ 32,              0x1000000 },
		{ 64,              0x1000000 },
		{ 512,             0x800000  },
		{ 1024,            0x400000  },
		{ 4096,            0x80000   },
		{ 8192,            0x40000   },
		{ 1024 * 1024 * 1, 0x800     },
		{ 1024 * 1024 * 4, 0x200     },
		{ 1024 * 1024 * 8, 0x100     },
	};
	size_t idx;

	for (idx = 0; idx < sizeof(runs) / sizeof(runs[0]); idx++) {
		bench(runs[idx].copysize, runs[idx].times);
	}

	random_bench(2048, 8000000);

	return 0;
}




/*
benchmark(size=32 bytes, times=16777216):
result(dst aligned, src aligned): memcpy_fast=78ms memcpy=260 ms
result(dst aligned, src unalign): memcpy_fast=78ms memcpy=250 ms
result(dst unalign, src aligned): memcpy_fast=78ms memcpy=266 ms
result(dst unalign, src unalign): memcpy_fast=78ms memcpy=234 ms

benchmark(size=64 bytes, times=16777216):
result(dst aligned, src aligned): memcpy_fast=109ms memcpy=281 ms
result(dst aligned, src unalign): memcpy_fast=109ms memcpy=328 ms
result(dst unalign, src aligned): memcpy_fast=109ms memcpy=343 ms
result(dst unalign, src unalign): memcpy_fast=93ms memcpy=344 ms

benchmark(size=512 bytes, times=8388608):
result(dst aligned, src aligned): memcpy_fast=125ms memcpy=218 ms
result(dst aligned, src unalign): memcpy_fast=156ms memcpy=484 ms
result(dst unalign, src aligned): memcpy_fast=172ms memcpy=546 ms
result(dst unalign, src unalign): memcpy_fast=172ms memcpy=515 ms

benchmark(size=1024 bytes, times=4194304):
result(dst aligned, src aligned): memcpy_fast=109ms memcpy=172 ms
result(dst aligned, src unalign): memcpy_fast=187ms memcpy=453 ms
result(dst unalign, src aligned): memcpy_fast=172ms memcpy=437 ms
result(dst unalign, src unalign): memcpy_fast=156ms memcpy=452 ms

benchmark(size=4096 bytes, times=524288):
result(dst aligned, src aligned): memcpy_fast=62ms memcpy=78 ms
result(dst aligned, src unalign): memcpy_fast=109ms memcpy=202 ms
result(dst unalign, src aligned): memcpy_fast=94ms memcpy=203 ms
result(dst unalign, src unalign): memcpy_fast=110ms memcpy=218 ms

benchmark(size=8192 bytes, times=262144):
result(dst aligned, src aligned): memcpy_fast=62ms memcpy=78 ms
result(dst aligned, src unalign): memcpy_fast=78ms memcpy=202 ms
result(dst unalign, src aligned): memcpy_fast=78ms memcpy=203 ms
result(dst unalign, src unalign): memcpy_fast=94ms memcpy=203 ms

benchmark(size=1048576 bytes, times=2048):
result(dst aligned, src aligned): memcpy_fast=203ms memcpy=191 ms
result(dst aligned, src unalign): memcpy_fast=219ms memcpy=281 ms
result(dst unalign, src aligned): memcpy_fast=218ms memcpy=328 ms
result(dst unalign, src unalign): memcpy_fast=218ms memcpy=312 ms

benchmark(size=4194304 bytes, times=512):
result(dst aligned, src aligned): memcpy_fast=312ms memcpy=406 ms
result(dst aligned, src unalign): memcpy_fast=296ms memcpy=421 ms
result(dst unalign, src aligned): memcpy_fast=312ms memcpy=468 ms
result(dst unalign, src unalign): memcpy_fast=297ms memcpy=452 ms

benchmark(size=8388608 bytes, times=256):
result(dst aligned, src aligned): memcpy_fast=281ms memcpy=452 ms
result(dst aligned, src unalign): memcpy_fast=280ms memcpy=468 ms
result(dst unalign, src aligned): memcpy_fast=298ms memcpy=514 ms
result(dst unalign, src unalign): memcpy_fast=344ms memcpy=472 ms

benchmark random access:
memcpy_fast=515ms memcpy=1014ms

*/


#endif// __MEMCPY_TESTING_CODE
#endif // ! __HAVE_AVX__
#endif // USE_SKYWIND_FAST_MEMCPY






