/*
 *  Performance Profiling routines
 *
 *  $Id: romeperf.c,v 1.1 2009/11/06 12:26:48 victoryman Exp $
 *
 *  Copyright (c) 2009 Realtek Semiconductor Corp.
 *
 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License version 2 as
 *  published by the Free Software Foundation.
 */

#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <asm/unistd.h>
#include <asm/processor.h>
#include <asm/uaccess.h>
#include <asm/rlxregs.h>
#include <linux/proc_fs.h>
#define rtlglue_malloc(size)	kmalloc(size, 0x1f0)
#define rtlglue_free(p)	kfree(p)
#define rtlglue_printf printk

#define CP3PERF_INDEX_MAX 64

struct cp3perf_stat_s {
	char *desc;
	u64 accCycle[4];
	u64 tempCycle[4];
	u32 executedNum;
	u32 hasTempCycle:1; /* true if tempCycle is valid. */
};
typedef struct cp3perf_stat_s cp3perf_stat_t;

#if defined(CONFIG_CPU_RLX5281) || defined(CONFIG_CPU_RLX4281)


#define CP3CNT_NEW_INST_FECTH 						0x01
#define CP3CNT_NEW_INST_FETCH_CACHE_MISS 			0x02
#define CP3CNT_NEW_INST_MISS_BUSY_CYCLE 			0x03
#define CP3CNT_DATA_STORE_INST 						0x04
#define CP3CNT_DATA_LOAD_INST 						0x05
#define CP3CNT_DATA_LOAD_OR_STORE_INST 				0x06
#define CP3CNT_EXACT_RETIRED_INST 					0x07
#define CP3CNT_CYCLES 								0x08
#define CP3CNT_DATA_LOAD_OR_STORE_CACHE_MISS 		0x0a
#define CP3CNT_DATA_LOAD_OR_STORE_MISS_BUSY_CYCLE	0x0b

#else

#define CP3CNT_CYCLES								0x10
#define CP3CNT_NEW_INST_FECTH						0x11
#define CP3CNT_NEW_INST_FETCH_CACHE_MISS			0x12
#define CP3CNT_NEW_INST_MISS_BUSY_CYCLE				0x13
#define CP3CNT_DATA_STORE_INST						0x14
#define CP3CNT_DATA_LOAD_INST						0x15
#define CP3CNT_DATA_LOAD_OR_STORE_INST				0x16
#define CP3CNT_EXACT_RETIRED_INST					0x17
#define CP3CNT_RETIRED_INST_FOR_PIPE_A				0x18
#define CP3CNT_RETIRED_INST_FOR_PIPE_B				0x19
#define CP3CNT_DATA_LOAD_OR_STORE_CACHE_MISS		0x1a
#define CP3CNT_DATA_LOAD_OR_STORE_MISS_BUSY_CYCLE	0x1b
#define CP3CNT_RESERVED12							0x1c
#define CP3CNT_RESERVED13							0x1d
#define CP3CNT_RESERVED14							0x1e
#define CP3CNT_RESERVED15							0x1f

#endif


/* Global variables */
static cp3perf_stat_t cp3PerfStat[CP3PERF_INDEX_MAX];
static u32 cp3perf_inited = 0;
static u32 cp3perf_enable = 1;



__IRAM static void CP3_COUNTER0_INIT( void )
{
__asm__ __volatile__ \
("  ;\
	mfc0	$8, $12			;\
	la		$9, 0x80000000	;\
	or		$8, $9			;\
	mtc0	$8, $12			;\
");
}

__IRAM static void CP3_COUNTER0_START( void )
{
	u32 reg;
#if 1 /* Inst */
	reg = /* Counter0 */((CP3CNT_CYCLES)<< 0) |
	      /* Counter1 */((CP3CNT_NEW_INST_FECTH)<< 8) |
	      /* Counter2 */((CP3CNT_NEW_INST_FETCH_CACHE_MISS)<<16) |
	      /* Counter3 */((CP3CNT_NEW_INST_MISS_BUSY_CYCLE)<<24);
#elif 1 /* Data (LOAD+STORE) */
	reg = /* Counter0 */((CP3CNT_CYCLES)<< 0) |
	      /* Counter1 */((CP3CNT_DATA_LOAD_OR_STORE_INST)<< 8) |
	      /* Counter2 */((CP3CNT_DATA_LOAD_OR_STORE_CACHE_MISS)<<16) |
	      /* Counter3 */((CP3CNT_DATA_LOAD_OR_STORE_MISS_BUSY_CYCLE)<<24);
#elif 1 /* Data (STORE) */
	reg = /* Counter0 */((CP3CNT_DATA_LOAD_INST)<< 0) |
	      /* Counter1 */((CP3CNT_DATA_STORE_INST)<< 8) |
	      /* Counter2 */((CP3CNT_DATA_LOAD_OR_STORE_CACHE_MISS)<<16) |
	      /* Counter3 */((CP3CNT_DATA_LOAD_OR_STORE_MISS_BUSY_CYCLE)<<24);
#else
#error
#endif	
	__asm__ __volatile__ (
	"	ctc3	%0, $0"
	: : "r"(reg)
	);
}

__IRAM static void CP3_COUNTER0_STOP( void )
{
__asm__ __volatile__ \
("	;\
	ctc3 	$0, $0			;\
");
}

__IRAM static u64 CP3_COUNTER0_GET( void )
{
	u64	counter;
__asm__ __volatile__ (			
	"	mfc3	$9, $8			\n"
	"	sw		$9, 4(%0)		\n"
	"	mfc3	$9, $9			\n"
	"	sw		$9, 0(%0)		\n"
	:
	: "r"(&counter)
);
	return counter;
}

__IRAM u32 test_cp3(void) {
	u32 c;
__asm__ __volatile__ (			
	"	mfc3	$9, $8			\n"
	"	sw		$9, %0			\n"	
	: "=m" (c)
);
	return c;
}

__IRAM static void CP3_COUNTER0_GET_ALL( u64 ptr[4] )
{
	__asm__ (
	"	mfc3	$9,	$9		\n"
	"	sw		$9,	0(%0)	\n"
	"	mfc3	$9,	$8		\n"
	"	sw		$9,	4(%0)	\n"
	"	mfc3	$9,	$11		\n"
	"	sw		$9,	0(%1)	\n"
	"	mfc3	$9,	$10		\n"
	"	sw		$9,	4(%1)	\n"
	"	mfc3	$9,	$13		\n"
	"	sw		$9,	0(%2)	\n"
	"	mfc3	$9,	$12		\n"
	"	sw		$9,	4(%2)	\n"
	"	mfc3	$9,	$15		\n"
	"	sw		$9,	0(%3)	\n"
	"	mfc3	$9,	$14		\n"
	"	sw		$9,	4(%3)	\n"
	:
	: "r"(&ptr[0]), "r"(&ptr[1]), "r"(&ptr[2]), "r"(&ptr[3]) 
	);
}

int cp3perfInit(void)
{
	int i;
	CP3_COUNTER0_INIT();
	CP3_COUNTER0_START();

	cp3perf_inited = 1;
	cp3perf_enable = 1;
	memset( &cp3PerfStat, 0, sizeof( cp3PerfStat ) );

	for (i = 0; i < CP3PERF_INDEX_MAX; i++) {
		cp3PerfStat[i].desc = "";
	}
	return 0;
}

int cp3perfReset(void)
{
	cp3perfInit();

	return 0;
}

int cp3perfPause( void )
{
	if ( !cp3perf_inited ) return -1;

	cp3perf_enable = 0;

	/* Louis patch: someone will disable CP3 in somewhere. */
	CP3_COUNTER0_INIT();

	CP3_COUNTER0_STOP();

	return 0;
}

int cp3perfResume( void )
{
	if ( cp3perf_inited == 0 ) return -1;

	cp3perf_enable = 1;

	/* Louis patch: someone will disable CP3 in somewhere. */
	CP3_COUNTER0_INIT();

	CP3_COUNTER0_START();

	return 0;
}

__IRAM int cp3perfEnter( u32 index )
{
	if ( !cp3perf_inited ||
	     !cp3perf_enable ) return -1;
	if ( index >= (sizeof(cp3PerfStat)/sizeof(cp3perf_stat_t)) )
		return -1;

	/* Louis patch: someone will disable CP3 in somewhere. */
	CP3_COUNTER0_INIT();

	CP3_COUNTER0_STOP();
	CP3_COUNTER0_GET_ALL(cp3PerfStat[index].tempCycle);
	cp3PerfStat[index].hasTempCycle = 1;
	CP3_COUNTER0_START();

	return 0;
}
EXPORT(cp3perfEnter);

__IRAM int cp3perfExit( u32 index )
{
	u64 currCnt[4];
	
	if ( !cp3perf_inited ||
	     !cp3perf_enable ) return -1;
	if ( index >= (sizeof(cp3PerfStat)/sizeof(cp3perf_stat_t)) )
		return -1;
	if ( cp3PerfStat[index].hasTempCycle == 0 )
		return -1;

	/* Louis patch: someone will disable CP3 in somewhere. */
	CP3_COUNTER0_INIT();

	CP3_COUNTER0_STOP();
	CP3_COUNTER0_GET_ALL(currCnt);
	cp3PerfStat[index].accCycle[0] += currCnt[0]-cp3PerfStat[index].tempCycle[0];
	cp3PerfStat[index].accCycle[1] += currCnt[1]-cp3PerfStat[index].tempCycle[1];
	cp3PerfStat[index].accCycle[2] += currCnt[2]-cp3PerfStat[index].tempCycle[2];
	cp3PerfStat[index].accCycle[3] += currCnt[3]-cp3PerfStat[index].tempCycle[3];
	cp3PerfStat[index].hasTempCycle = 0;
	cp3PerfStat[index].executedNum++;
	CP3_COUNTER0_START();

	return 0;
}
EXPORT(cp3perfExit);

int cp3perfDump( int start, int end )
{

	int i;

	cp3perf_stat_t* statSnapShot = rtlglue_malloc(sizeof(cp3perf_stat_t) * (end - start + 1) );
	if( statSnapShot == NULL )
	{
		rtlglue_printf("statSnapShot mem alloc failed\n");
		return -1;
	}

	rtlglue_printf( "index %30s %12s %8s %10s\n", "description", "accCycle", "totalNum", "Average" );

	for( i = start; i <= end; i++ )
	{
		int j;
		for( j =0; j < sizeof(cp3PerfStat[i].accCycle)/sizeof(cp3PerfStat[i].accCycle[0]); j++ )
		{
			statSnapShot[i].accCycle[j]  = cp3PerfStat[i].accCycle[j];
			statSnapShot[i].tempCycle[j] = cp3PerfStat[i].tempCycle[j];
		}
		statSnapShot[i].executedNum  = cp3PerfStat[i].executedNum;
		statSnapShot[i].hasTempCycle = cp3PerfStat[i].hasTempCycle;
	}

	for( i = start; i <= end; i++ )
	{
		if ( statSnapShot[i].executedNum == 0 )
		{
			rtlglue_printf( "[%3d] %30s %12s %8s %10s\n", i, cp3PerfStat[i].desc, "--", "--", "--" );
		}
		else
		{
			int j;
			rtlglue_printf( "[%3d] %30s ", i, cp3PerfStat[i].desc );
			for( j =0; j < sizeof(statSnapShot[i].accCycle)/sizeof(statSnapShot[i].accCycle[0]); j++ )
			{
				u32 *pAccCycle = (u32*)&statSnapShot[i].accCycle[j];
				u32 avrgCycle = /* Hi-word */ (pAccCycle[0]*(0xffffffff/statSnapShot[i].executedNum)) +
				                /* Low-word */(pAccCycle[1]/statSnapShot[i].executedNum);
				rtlglue_printf( "%12llu %8u %10u\n",
				statSnapShot[i].accCycle[j],
				statSnapShot[i].executedNum,
				avrgCycle );
				rtlglue_printf( " %3s  %30s ", "", "" );
			}
			rtlglue_printf( "\r" );
		}
	}

	rtlglue_free(statSnapShot);

	return 0;
}

static int cp3_start=0, cp3_end=9;
static int read_perf_dump(char *page, char **start, off_t off, int count, int *eof, void *data) 
 {
	unsigned long flags;
	printk("Dump range %d-%d\n", cp3_start, cp3_end);
	local_irq_save(flags);
	cp3perfDump(cp3_start, cp3_end);
	local_irq_restore(flags);
	return 0;
}

static int flush_perf_dump(struct file *file, const char __user *buffer, unsigned long count, void *data)
{
	unsigned long flags;
	char cmd[32];
	int val1, val2=0, n;
	
	if (copy_from_user(cmd, buffer, count > sizeof(cmd) ? sizeof(cmd) : count)) {
		return -EFAULT;
	}
	
	if (!strncmp(cmd, "reset", 5)) {
		local_irq_save(flags);
		cp3perfReset();
		local_irq_restore(flags);
	} else if ((n=sscanf(cmd, "%d %d",&val1,&val2))&&(n>0)) {	
		switch(n) {
		case 2:
			if ((val2>CP3PERF_INDEX_MAX)||(val2<0)) return -EINVAL;			
		case 1:
			if ((val1>CP3PERF_INDEX_MAX)||(val1<0)) return -EINVAL;
			if (val2 > val1) {
				cp3_start = val1; 
				cp3_end   = val2;
			} else {
				cp3_start = val2; 
				cp3_end   = val1;
			}
			break;
		default:
			return -EINVAL;
		}
	}
	return count;
};


extern struct proc_dir_entry *realtek_proc;

static int __init cp3perf_init(void) {
	struct proc_dir_entry *pe;
	
	cp3perfInit();
	
	pe = create_proc_entry("cp3perf", S_IRUSR |S_IWUSR | S_IRGRP | S_IROTH, realtek_proc);
	if (!pe) {
		return -EINVAL;
	}
	
	pe->read_proc  = read_perf_dump;
	pe->write_proc = flush_perf_dump;
	return 0;
}

static void __exit cp3perf_exit(void) {
}

module_init(cp3perf_init); 
module_exit(cp3perf_exit);