/*
* Copyright c                  Realtek Semiconductor Corporation, 2002  
* All rights reserved.
* 
* Program : Performance Profiling for ROME Driver
* Abstract : 
* Author : Yung-Chieh Lo (yjlou@realtek.com.tw)               
* $Id: romeperf.c,v 1.4 2012/01/11 12:51:01 krammer Exp $
*/

#include "romeperf.h"
#include <net/rtl/rtl_glue.h>
#define KERNEL_SYSeALLS 
#include <asm/unistd.h>
#include <asm/processor.h>
#include <asm/uaccess.h>
//#include <asm/mipsregs.h>
#include <linux/interrupt.h>

#define rtlglue_malloc(size)	kmalloc(size, 0x1f0)
#define rtlglue_free(p)	kfree(p)
#define rtlglue_printf printk

enum CP3_COUNTER
{
 CP3CNT_STOP = 0,
 CP3CNT_NEW_INST_FECTH,//1
 CP3CNT_NEW_INST_FETCH_CACHE_MISS,//2
 CP3CNT_NEW_INST_MISS_BUSY_CYCLE,//3
 CP3CNT_DATA_STORE_INST,//4
 CP3CNT_DATA_LOAD_INST,//5
 CP3CNT_DATA_LOAD_OR_STORE_INST,//6
 CP3CNT_COMPLETED_INST,//7
 CP3CNT_CYCLES,//8
 CP3CNT_I_CACHE_SOFT_MISS,//9
 CP3CNT_DATA_LOAD_OR_STORE_CACHE_MISS,//10
 CP3CNT_DATA_LOAD_OR_STORE_MISS_BUSY_CYCLE,//11
 CP3CNT_RESERVED12,
 CP3CNT_RESERVED13,
 CP3CNT_RESERVED14,
 CP3CNT_RESERVED15,
};


/* Local variables */
volatile static uint64 tempVariable64;
static uint32 tempVariable32;
static uint64 currCnt[4];

/* Global variables */
uint64 cnt1, cnt2;
rtl8651_romeperf_stat_t romePerfStat[ROMEPERF_INDEX_MAX];
uint32 rtl8651_romeperf_inited = 0;
uint32 rtl8651_romeperf_enable = TRUE;

__IRAM void CP3_COUNTER0_INIT( void )
{
__asm__ __volatile__ \
("  ;\
	mfc0	$8, $12			;\
	la		$9, 0x80000000	;\
	or		$8, $9			;\
	mtc0	$8, $12			;\
");
}

__IRAM uint32 CP3_COUNTER0_IS_INITED( void )
{
__asm__ __volatile__ \
("  ;\
	mfc0	$8, $12			;\
	la		$9, tempVariable32;\
	sw		$8, 0($9)		;\
");
	return tempVariable32;
}

__IRAM void CP3_COUNTER0_START( void )
{
#if 1 /* Inst */
	tempVariable32 = /* Counter0 */((CP3CNT_CYCLES)<< 0) |
	                 /* Counter1 */((CP3CNT_NEW_INST_FECTH)<< 8) |
	                 /* Counter2 */((CP3CNT_NEW_INST_FETCH_CACHE_MISS)<<16) |
	                 /* Counter3 */((CP3CNT_NEW_INST_MISS_BUSY_CYCLE)<<24);
#elif 1 /* Data (LOAD+STORE) */
	tempVariable32 = /* Counter0 */((CP3CNT_CYCLES)<< 0) |
	                 /* Counter1 */((CP3CNT_DATA_LOAD_OR_STORE_INST)<< 8) |
	                 /* Counter2 */((CP3CNT_DATA_LOAD_OR_STORE_CACHE_MISS)<<16) |
	                 /* Counter3 */((CP3CNT_DATA_LOAD_OR_STORE_MISS_BUSY_CYCLE)<<24);
#elif 1 /* Data (STORE) */
	tempVariable32 = /* Counter0 */((CP3CNT_DATA_LOAD_INST)<< 0) |
	                 /* Counter1 */((CP3CNT_DATA_STORE_INST)<< 8) |
	                 /* Counter2 */((CP3CNT_DATA_LOAD_OR_STORE_CACHE_MISS)<<16) |
	                 /* Counter3 */((CP3CNT_DATA_LOAD_OR_STORE_MISS_BUSY_CYCLE)<<24);
#else
#error
#endif

__asm__ __volatile__ \
("  ;\
	la		$8, tempVariable32	;\
	lw		$8, 0($8)			;\
	ctc3 	$8, $0				;\
");
}

__IRAM void CP3_COUNTER0_STOP( void )
{
__asm__ __volatile__ \
("	;\
	ctc3 	$0, $0			;\
");
}

__IRAM uint64 CP3_COUNTER0_GET( void )
{
__asm__ __volatile__ \
("	;\
	la		$8, tempVariable64;\
	mfc3	$9, $9			;\
	sw		$9, 0($8)		;\
	mfc3	$9, $8			;\
	sw		$9, 4($8)		;\
");
	return tempVariable64;
}

__IRAM void CP3_COUNTER0_GET_ALL( void )
{
__asm__ __volatile__ \
("	;\
	la		$4, currCnt		;\
	mfc3	$9, $9			;\
	sw		$9, 0x00($4)	;\
	mfc3	$9, $8			;\
	sw		$9, 0x04($4)	;\
	mfc3	$9, $11			;\
	sw		$9, 0x08($4)	;\
	mfc3	$9, $10			;\
	sw		$9, 0x0C($4)	;\
	mfc3	$9, $13			;\
	sw		$9, 0x10($4)	;\
	mfc3	$9, $12			;\
	sw		$9, 0x14($4)	;\
	mfc3	$9, $15			;\
	sw		$9, 0x18($4)	;\
	mfc3	$9, $14			;\
	sw		$9, 0x1C($4)	;\
");
}

int32 rtl8651_romeperfInit()
{
	CP3_COUNTER0_INIT();
	CP3_COUNTER0_START();

	rtl8651_romeperf_inited = TRUE;
	rtl8651_romeperf_enable = TRUE;
	memset( &romePerfStat, 0, sizeof( romePerfStat ) );

	romePerfStat[ROMEPERF_INDEX_NAPT_ADD].desc = "NAPT add_all";
	romePerfStat[ROMEPERF_INDEX_NAPT_ADD_1].desc = "NAPT add_checkIntIP";
	romePerfStat[ROMEPERF_INDEX_NAPT_ADD_2].desc = "NAPT add_localServer";
	romePerfStat[ROMEPERF_INDEX_NAPT_ADD_3].desc = "NAPT add_checkExtIp";
	romePerfStat[ROMEPERF_INDEX_NAPT_ADD_4].desc = "NAPT add_dupCheck1";
	romePerfStat[ROMEPERF_INDEX_NAPT_ADD_5].desc = "NAPT add_dupCheck2";
	romePerfStat[ROMEPERF_INDEX_NAPT_ADD_6].desc = "NAPT add_bPortReused";
	romePerfStat[ROMEPERF_INDEX_NAPT_ADD_7].desc = "NAPT add_routeCache";
	romePerfStat[ROMEPERF_INDEX_NAPT_ADD_8].desc = "NAPT add_tooManyConn";
	romePerfStat[ROMEPERF_INDEX_NAPT_ADD_9].desc = "NAPT add_initConn";
	romePerfStat[ROMEPERF_INDEX_NAPT_ADD_10].desc = "NAPT add_decisionFlo";
	romePerfStat[ROMEPERF_INDEX_NAPT_ADD_11].desc = "NAPT add_ambiguous";
	romePerfStat[ROMEPERF_INDEX_NAPT_DEL].desc = "NAPT del";
	romePerfStat[ROMEPERF_INDEX_NAPT_FIND_OUTBOUND].desc = "NATP outbound";
	romePerfStat[ROMEPERF_INDEX_NAPT_FIND_INBOUND].desc = "NAPT inbound";
	romePerfStat[ROMEPERF_INDEX_NAPT_UPDATE].desc = "NAPT update";
	romePerfStat[ROMEPERF_INDEX_UNTIL_RXTHREAD].desc = "IntDispatch-RxThread";
	romePerfStat[ROMEPERF_INDEX_RECVLOOP].desc = "RecvLoop-FwdInput";
	romePerfStat[ROMEPERF_INDEX_FWDENG_INPUT].desc = "FwdEng_Input()";
	romePerfStat[ROMEPERF_INDEX_BEFORE_CRYPTO_ENCAP].desc = "FwdInput-Crypto(En)";
	romePerfStat[ROMEPERF_INDEX_ENCAP].desc = "IPSEC Encap";
	romePerfStat[ROMEPERF_INDEX_ENCAP_CRYPTO_ENGINE].desc = "Encap Crypto";
	romePerfStat[ROMEPERF_INDEX_ENCAP_AUTH_ENGINE].desc = "Encap Authtication";
	romePerfStat[ROMEPERF_INDEX_BEFORE_CRYPTO_DECAP].desc = "FwdInput-Crypto(De)";
	romePerfStat[ROMEPERF_INDEX_DECAP].desc = "IPSEC Decap";
	romePerfStat[ROMEPERF_INDEX_DECAP_CRYPTO_ENGINE].desc = "Decap Crypto";
	romePerfStat[ROMEPERF_INDEX_DECAP_AUTH_ENGINE].desc = "Decap Authtication";
	romePerfStat[ROMEPERF_INDEX_FASTPATH].desc = "Fast Path";
	romePerfStat[ROMEPERF_INDEX_SLOWPATH].desc = "Slow Path";
	romePerfStat[ROMEPERF_INDEX_FWDENG_SEND].desc = "FwdEngSend()";
	romePerfStat[ROMEPERF_INDEX_UNTIL_ACLDB].desc = "FwdInput() Until ACLDB";
	romePerfStat[ROMEPERF_INDEX_GET_MTU_AND_SOURCE_MAC].desc = "L3Route_MTU_srcMAC";
	romePerfStat[ROMEPERF_INDEX_PPTPL2TP_1].desc = "L3Route_PPTPL2TP_1";
	romePerfStat[ROMEPERF_INDEX_PPPOE_ARP_CACHE].desc = "L3Route_PPPoE_ArpCache";
	romePerfStat[ROMEPERF_INDEX_PPTPL2TP_SEND].desc = "L3Route_PptpL2tpSend()";
	romePerfStat[ROMEPERF_INDEX_FRAG].desc = "L3Route_Fragment";
	romePerfStat[ROMEPERF_INDEX_EGRESS_ACL].desc = "FwdSend_EgressACL";
	romePerfStat[ROMEPERF_INDEX_PPTPL2TP_ENCAP].desc = "FwdSend_PPTP/L2TP_Encap";
	romePerfStat[ROMEPERF_INDEX_FROM_PS].desc = "FwdSend_FromPS";
	romePerfStat[ROMEPERF_INDEX_EXTDEV_SEND].desc = "FwdSend_ExtDevSend()";
	romePerfStat[ROMEPERF_INDEX_FRAG_2ND_HALF].desc = "FwdSend_Frag_2ndHalf()";
	romePerfStat[ROMEPERF_INDEX_TXPKTPOST].desc = "rtl8651_txPktPostProcessing()";
	romePerfStat[ROMEPERF_INDEX_MBUFPAD].desc = "mBuf_padding()";
	romePerfStat[ROMEPERF_INDEX_TXALIGN].desc = "_swNic_txAlign";
	romePerfStat[ROMEPERF_INDEX_ISRTXRECYCLE].desc = "_swNic_isrTxRecycle";
	romePerfStat[ROMEPERF_INDEX_16].desc = "FwdEng_temp_16";
	romePerfStat[ROMEPERF_INDEX_17].desc = "FwdEng_temp_17";
	romePerfStat[ROMEPERF_INDEX_18].desc = "FwdEng_temp_18";
        romePerfStat[ROMEPERF_INDEX_19].desc = "FwdEng_temp_19";
        romePerfStat[ROMEPERF_INDEX_20].desc = "FwdEng_temp_20";
        romePerfStat[ROMEPERF_INDEX_21].desc = "FwdEng_temp_21";
        romePerfStat[ROMEPERF_INDEX_22].desc = "FwdEng_temp_22";
        romePerfStat[ROMEPERF_INDEX_23].desc = "FwdEng_temp_23";
        romePerfStat[ROMEPERF_INDEX_24].desc = "FwdEng_temp_24";
        romePerfStat[ROMEPERF_INDEX_25].desc = "FwdEng_temp_25";
        romePerfStat[ROMEPERF_INDEX_FLUSHDCACHE].desc = "rtlglue_flushDCache";
        romePerfStat[ROMEPERF_INDEX_IRAM_1].desc = "IRAM Cached within IRAM";
        romePerfStat[ROMEPERF_INDEX_IRAM_2].desc = "IRAM Uncached within IRAM";
        romePerfStat[ROMEPERF_INDEX_IRAM_3].desc = "test ICACHE  (1024*100)";
        romePerfStat[ROMEPERF_INDEX_IRAM_4].desc = "test Uncached (1024*10)";
        romePerfStat[ROMEPERF_INDEX_DRAM_1].desc = "DRAM Cached within DRAM";
        romePerfStat[ROMEPERF_INDEX_DRAM_2].desc = "DRAM Uncached within DRAM";
        romePerfStat[ROMEPERF_INDEX_DRAM_3].desc = "test DCACHE  (1024*100)";
        romePerfStat[ROMEPERF_INDEX_DRAM_4].desc = "test Uncached (1024*10)";
        romePerfStat[ROMEPERF_INDEX_BMP].desc = "KMP Algorithm";
        romePerfStat[ROMEPERF_INDEX_MDCMDIO].desc = "MDCMDIO PHY Register ACCESS";
        romePerfStat[ROMEPERF_INDEX_TEST0].desc = "TEST 0";
        romePerfStat[ROMEPERF_INDEX_TEST1].desc = "TEST 1";
        romePerfStat[ROMEPERF_INDEX_TEST2].desc = "TEST 2";
        romePerfStat[ROMEPERF_INDEX_TEST3].desc = "TEST 3";
        romePerfStat[ROMEPERF_INDEX_TEST4].desc = "TEST 4";
	 romePerfStat[ROMEPERF_INDEX_PCIE_IRQ].desc = "PCIE_IRQ";
	romePerfStat[ROMEPERF_INDEX_RX_DSR].desc = "RX DSR";
	romePerfStat[ROMEPERF_INDEX_RX_ISR].desc = "RX ISR";
	romePerfStat[ROMEPERF_INDEX_validate_mpdu].desc = "validate mpdu";  
	romePerfStat[ROMEPERF_INDEX_swNic_receive].desc = "swNic_receive";
	romePerfStat[ROMEPERF_INDEX_rtl_processReceivedInfo].desc = "rtl_processReceivedInfo";
	romePerfStat[ROMEPERF_INDEX_rtl_decideRxDevice].desc = "rtl_decideRxDevice";
	romePerfStat[ROMEPERF_INDEX_rtl_processRxFrame].desc = "rtl_processRxFrame";
	romePerfStat[ROMEPERF_INDEX_rtl_processBridgeShortCut].desc = "rtl_processBridgeShortCut";
	romePerfStat[ROMEPERF_INDEX_rtl_processRxToProtcolStack].desc = "rtl_processRxToProtcolStack";
	romePerfStat[ROMEPERF_INDEX_netif_receive_skb].desc = "netif_receive_skb";
	romePerfStat[ROMEPERF_INDEX_FastPath_Enter].desc = "FastPath_Enter";
	romePerfStat[ROMEPERF_INDEX_checksum_in_fastpath].desc = "checksum_in_fastpath";
	romePerfStat[ROMEPERF_INDEX_enter_fast_path].desc = "enter_fast_path";
	romePerfStat[ROMEPERF_INDEX_ip_finish_output3].desc = "ip_finish_output3";
	romePerfStat[ROMEPERF_INDEX_ndo_start_xmit].desc = "ndo_start_xmit";
	romePerfStat[ROMEPERF_INDEX_rtl_preProcess_xmit].desc = "rtl_preProcess_xmit";
	romePerfStat[ROMEPERF_INDEX_rtl_fill_txInfo].desc = "rtl_fill_txInfo";
	romePerfStat[ROMEPERF_INDEX_swNic_send].desc = "swNic_send";
	romePerfStat[ROMEPERF_INDEX_swNic_txDone].desc = "swNic_txDone";
	romePerfStat[ROMEPERF_INDEX_dev_kfree_skb_any].desc = "dev_kfree_skb_any"	;
	romePerfStat[ROMEPERF_INDEX_fastpath_1].desc = "fastpath_1";
	romePerfStat[ROMEPERF_INDEX_fastpath_2].desc = "fastpath_2";
	romePerfStat[ROMEPERF_INDEX_fastpath_3].desc = "fastpath_3";
 
 
        return SUCCESS;
}

int32 rtl8651_romeperfReset()
{
	rtl8651_romeperfInit();
	
	return SUCCESS;
}

#if 0/* old fashion function, for reference only. */
int32 rtl8651_romeperfStart()
{
	if ( rtl8651_romeperf_inited == FALSE ) rtl8651_romeperfInit();
	
	START_AND_GET_CP3_COUNTER0( cnt1 );

	return SUCCESS;
}

int32 rtl8651_romeperfStop( uint64 *pDiff )
{
	if ( rtl8651_romeperf_inited == FALSE ) rtl8651_romeperfInit();
	
	STOP_AND_GET_CP3_COUNTER0( cnt2 );

	*pDiff = cnt2 - cnt1;
	return SUCCESS;
}
#endif

int32 rtl8651_romeperfGet( uint64 *pGet )
{
	if ( rtl8651_romeperf_inited == FALSE ) return FAILED;

	/* Louis patch: someone will disable CP3 in somewhere. */
	CP3_COUNTER0_INIT();

	CP3_COUNTER0_STOP();
	*pGet = CP3_COUNTER0_GET();
	CP3_COUNTER0_START();
	
	return SUCCESS;
}

int32 rtl8651_romeperfPause( void )
{
	if ( rtl8651_romeperf_inited == FALSE ) return FAILED;

	rtl8651_romeperf_enable = FALSE;
	
	/* Louis patch: someone will disable CP3 in somewhere. */
	CP3_COUNTER0_INIT();

	CP3_COUNTER0_STOP();
	
	return SUCCESS;
}

int32 rtl8651_romeperfResume( void )
{
	if ( rtl8651_romeperf_inited == FALSE ) return FAILED;

	rtl8651_romeperf_enable = TRUE;
	
	/* Louis patch: someone will disable CP3 in somewhere. */
	CP3_COUNTER0_INIT();
 	
	CP3_COUNTER0_START();
	
	return SUCCESS;
}

__IRAM int rtl8651_romeperfEnterPoint( int index )
{
	if ( rtl8651_romeperf_inited == FALSE ||
	     rtl8651_romeperf_enable == FALSE ) return FAILED;
	if ( index >= (sizeof(romePerfStat)/sizeof(rtl8651_romeperf_stat_t)) )
		return FAILED;

	/* Louis patch: someone will disable CP3 in somewhere. */
	CP3_COUNTER0_INIT();

	CP3_COUNTER0_STOP();
	CP3_COUNTER0_GET_ALL();
	romePerfStat[index].tempCycle[0] = currCnt[0];
	romePerfStat[index].tempCycle[1] = currCnt[1];
	romePerfStat[index].tempCycle[2] = currCnt[2];
	romePerfStat[index].tempCycle[3] = currCnt[3];
	romePerfStat[index].hasTempCycle = TRUE;
	CP3_COUNTER0_START();
	
	return SUCCESS;
}

__IRAM int rtl8651_romeperfExitPoint(int index )
{
	if ( rtl8651_romeperf_inited == FALSE ||
	     rtl8651_romeperf_enable == FALSE ) return FAILED;
	if ( index >= (sizeof(romePerfStat)/sizeof(rtl8651_romeperf_stat_t)) )
		return FAILED;
	if ( romePerfStat[index].hasTempCycle == FALSE )
		return FAILED;

	/* Louis patch: someone will disable CP3 in somewhere. */
	CP3_COUNTER0_INIT();
	
	CP3_COUNTER0_STOP();
	CP3_COUNTER0_GET_ALL();
	romePerfStat[index].accCycle[0] += currCnt[0]-romePerfStat[index].tempCycle[0];
	romePerfStat[index].accCycle[1] += currCnt[1]-romePerfStat[index].tempCycle[1];
	romePerfStat[index].accCycle[2] += currCnt[2]-romePerfStat[index].tempCycle[2];
	romePerfStat[index].accCycle[3] += currCnt[3]-romePerfStat[index].tempCycle[3];
	romePerfStat[index].hasTempCycle = FALSE;
	romePerfStat[index].executedNum++;
	CP3_COUNTER0_START();
	
	return SUCCESS;
}

int32 rtl8651_romeperfDump( int start, int end )
{
#if 0
	int i;

	rtlglue_printf( "index %30s %12s %8s %10s\n", "description", "accCycle", "totalNum", "Average" );
	for( i = start; i <= end; i++ )
	{
		if ( romePerfStat[i].executedNum == 0 )
		{
			rtlglue_printf( "[%3d] %30s %12s %8s %10s\n", i, romePerfStat[i].desc, "--", "--", "--" );
		}
		else
		{
			int j;
			rtlglue_printf( "[%3d] %30s ", 
			                i, romePerfStat[i].desc );
			for( j =0; j < sizeof(romePerfStat[i].accCycle)/sizeof(romePerfStat[i].accCycle[0]);
			     j++ )
			{
				uint32 *pAccCycle = (uint32*)&romePerfStat[i].accCycle[j];
				uint32 avrgCycle = /* Hi-word */ (pAccCycle[0]*(0xffffffff/romePerfStat[i].executedNum)) +
				                   /* Low-word */(pAccCycle[1]/romePerfStat[i].executedNum);

				rtlglue_printf( "%12llu %8u %10u\n",
				                romePerfStat[i].accCycle[j],
				                romePerfStat[i].executedNum, 
				                avrgCycle
				                );
				rtlglue_printf( " %3s  %30s ", "", "" );
			}
			rtlglue_printf( "\r" );
		}
	}
	
	return SUCCESS;
#else
	int i;

	rtl8651_romeperf_stat_t* statSnapShot = rtlglue_malloc(sizeof(rtl8651_romeperf_stat_t) * (end - start + 1) );
	if( statSnapShot == NULL )
	{
		rtlglue_printf("statSnapShot mem alloc failed\n");
		return FAILED;
	}

	rtlglue_printf( "index %30s %12s %8s %10s\n", "description", "accCycle", "totalNum", "Average" );

	for( i = start; i <= end; i++ )
	{
		int j;
		for( j =0; j < sizeof(romePerfStat[i].accCycle)/sizeof(romePerfStat[i].accCycle[0]); j++ )
		{
			statSnapShot[i].accCycle[j]  = romePerfStat[i].accCycle[j];
			statSnapShot[i].tempCycle[j] = romePerfStat[i].tempCycle[j];
		}
		statSnapShot[i].executedNum  = romePerfStat[i].executedNum;
		statSnapShot[i].hasTempCycle = romePerfStat[i].hasTempCycle;
	}

	for( i = start; i <= end; i++ )
	{
		if ( statSnapShot[i].executedNum == 0 )
		{
			rtlglue_printf( "[%3d] %30s %12s %8s %10s\n", i, romePerfStat[i].desc, "--", "--", "--" );
		}
		else
		{
			int j;
			rtlglue_printf( "[%3d] %30s ", i, romePerfStat[i].desc );
			for( j =0; j < sizeof(statSnapShot[i].accCycle)/sizeof(statSnapShot[i].accCycle[0]); j++ )
			{
				uint32 *pAccCycle = (uint32*)&statSnapShot[i].accCycle[j];
				uint32 avrgCycle = /* Hi-word */ (pAccCycle[0]*(0xffffffff/statSnapShot[i].executedNum)) +
				                   /* Low-word */(pAccCycle[1]/statSnapShot[i].executedNum);
				rtlglue_printf( "%12llu %8u %10u\n",
				statSnapShot[i].accCycle[j], 
				statSnapShot[i].executedNum, 
				avrgCycle );
				rtlglue_printf( " %3s  %30s ", "", "" );
			}
			rtlglue_printf( "\r" );
		}
	}

	rtlglue_free(statSnapShot);
	
	return SUCCESS;
#endif
}

int rtl865x_perf_proc_read( char *page, char **start, off_t off, int count, int *eof, void *data )
{
	//unsigned long x;
	//spin_lock_irq(x);

	rtl8651_romeperfDump(79, 98);

	//spin_unlock_irq(x);
    return count;
}

int rtl865x_perf_proc_write(struct file *file, const char *buffer,
              unsigned long count, void *data)
{
	unsigned long x;
	spin_lock_irq(x);

	rtl8651_romeperfReset();

	spin_unlock_irq(x);
    return count;
}