#include <linux/module.h> 
#include <linux/kernel.h> 
#include <linux/init.h> 
#include <linux/proc_fs.h> 
#include <linux/seq_file.h>
#include <linux/cpu.h>
#include <linux/cpuset.h>
#include <linux/percpu.h>
#include <linux/delay.h>
#include <linux/slab.h>
#include <bspchip.h>

/*
 * Number of independent measurement slots kept per CPU. The `index`
 * argument of hwperf_start()/hwperf_stop() selects one of these slots.
 */
#define HWPERF_INDEX_MAX 8

/*
 * Statistics for one measurement slot on one CPU.
 *
 * Array element [0] belongs to hardware counter 0 and element [1] to
 * hardware counter 1.
 */
typedef struct hwperf_stat_s {
	/* Sum of the counter deltas gathered by every hwperf_stop() call. */
	u64 accCycle[2];
	/* Counter values sampled by the most recent hwperf_start() call. */
	u32 prevCount[2];
	/* Number of hwperf_stop() calls accounted into accCycle[]. */
	u32 executedNum;
} hwperf_stat_t;


/* One array of HWPERF_INDEX_MAX measurement slots for each CPU. */
DEFINE_PER_CPU(hwperf_stat_t[HWPERF_INDEX_MAX], hwperf_stats);
/*
 * Control words derived from perfevent/perfmask by hwperf_ctrl_update();
 * hwperf_start() writes them to the two performance control registers.
 */
static u32 perfctrl0, perfctrl1;
/*
 * Current configuration: perfevent packs two 8-bit event numbers
 * (event1 in bits 15:8, event0 in bits 7:0); perfmask supplies the low
 * five bits of both control words.
 */
static u32 perfevent, perfmask;

/*
 * Reset the statistics of every measurement slot on every possible CPU
 * back to zero.
 */
static void hwperf_clear_all(void) {
	unsigned int cpu_id;

	for_each_possible_cpu(cpu_id) {
		hwperf_stat_t *slots = per_cpu_ptr(hwperf_stats, cpu_id);

		/* Wipe the whole HWPERF_INDEX_MAX-entry array in one go. */
		memset(slots, 0, HWPERF_INDEX_MAX * sizeof(*slots));
	}
}

/*
 * Rebuild the two cached control words from an event pair and a mask.
 *
 * @event: low byte selects the event for perfctrl0, bits 15:8 select
 *         the event for perfctrl1.
 * @mask:  only the low five bits are used; they land in bits 4:0 of
 *         both control words.
 *
 * In both words the 8-bit event number ends up in bits 12:5.
 */
static inline void hwperf_ctrl_update(u32 event, u32 mask) {
	const u32 mode_bits = mask & 0x1F;
	const u32 event0_field = (event & 0x00ff) << 5;
	const u32 event1_field = (event & 0xff00) >> 3;

	perfctrl0 = mode_bits | event0_field;
	perfctrl1 = mode_bits | event1_field;
}

/*
static inline void ___hwperf_enable(void) {
	asm (
		"	.set push			\n"
		"	.set noreorder		\n"
		"	mtc0	$0, $25, 0	\n"
		"	mtc0	$0, $25, 2	\n"
		"	mtc0	$0, $25, 1	\n"
		"	mtc0	$0, $25, 3	\n"
		"	mtc0	%0, $25, 0	\n"		
		"	mtc0	%1, $25, 2	\n"		
		"	.set pop			\n"
		:
		: "r" (perfctrl0), 
		  "r" (perfctrl1)
	);
}
*/

/*
 * Write zero to coprocessor-0 register 25, selects 0, 2, 1 and 3, in that
 * order. Elsewhere in this file selects 0/2 receive the control words and
 * selects 1/3 are read back as counter values, so this presumably stops
 * and clears both performance counters -- NOTE(review): confirm against
 * the CPU manual.
 *
 * An asm statement without output operands is implicitly volatile, so the
 * compiler will not drop these writes.
 */
static inline void ___hwperf_disable(void) {
	asm (
		"	.set push			\n"
		"	.set noreorder		\n"
		"	mtc0	$0, $25, 0	\n"
		"	mtc0	$0, $25, 2	\n"
		"	mtc0	$0, $25, 1	\n"
		"	mtc0	$0, $25, 3	\n"		
		"	.set pop			\n"		
	);
}



void hwperf_start(int index) {
	unsigned long flags;
	hwperf_stat_t *ptr;
	local_irq_save(flags);
	ptr = this_cpu_ptr(hwperf_stats);
	write_c0_perfctrl0(0);
	write_c0_perfctrl1(0);
	wmb();
	ptr[index].prevCount[0] = read_c0_perfcntr0();
	ptr[index].prevCount[1] = read_c0_perfcntr1();
	wmb();
	write_c0_perfctrl0(perfctrl0);
	write_c0_perfctrl1(perfctrl1);
	local_irq_restore(flags);
}

void hwperf_stop(int index) 
{
	u32 tmp0, tmp1;
	unsigned long flags;
	hwperf_stat_t *ptr;
	
	local_irq_save(flags);
	asm (
		"	.set push			\n"
		"	.set noreorder		\n"
		"	mtc0	$0, $25, 0	\n"
		"	mtc0	$0, $25, 2	\n"
		"	mfc0	%0, $25, 1	\n"
		"	mfc0	%1, $25, 3	\n"			
		"	.set pop			\n"	
		: "=r" (tmp0), "=r" (tmp1)
	);
		
	ptr = this_cpu_ptr(hwperf_stats);
	
	tmp0 = tmp0 - ptr[index].prevCount[0];
	tmp1 = tmp1 - ptr[index].prevCount[1];
	
	ptr[index].accCycle[0] += tmp0;
	ptr[index].accCycle[1] += tmp1;
	ptr[index].executedNum++;	
	local_irq_restore(flags);
}

static int hwperf_proc_read(struct seq_file *f, void *data) {	
	int i;	
	u8 event0, event1;
	seq_printf(f, "event: %x, mask %x\n", perfevent, perfmask);
	event0 = perfevent&0xff; 
	event1 = perfevent>>8;
	seq_printf(f, "id P    Event[%2d]      Event[%2d]         Exec       	  Avg[%2d]         Avg[%2d]\n",event1,event0,event1,event0);
	seq_printf(f, "-- -   -------------   -------------  -------------  -------------  -------------\n");
	for (i=0; i<HWPERF_INDEX_MAX;i++) {
		u64 avrgCycle[2] = { 0, 0 };
		hwperf_stat_t *pstat;
		unsigned int cpu;
		int isprinted;
		
		isprinted = 0;
		
		for_each_possible_cpu(cpu) {
			pstat = &(per_cpu_ptr(hwperf_stats, cpu)[i]);
			
			if (pstat->executedNum) {
				avrgCycle[0] = div64_u64(pstat->accCycle[0],pstat->executedNum);
				avrgCycle[1] = div64_u64(pstat->accCycle[1],pstat->executedNum);			
			} else 
				continue;
			
			if(isprinted==0) {
				seq_printf(f, "%2d ", i);
				isprinted=1;
			} else 
				seq_printf(f, "   ");
			
			seq_printf(f, "%d  %15llu %15llu %10u %15llu %15llu\n", cpu, pstat->accCycle[1], pstat->accCycle[0], pstat->executedNum, avrgCycle[1], avrgCycle[0]);
		}
	}
	return 0;
}

/*
 * open() handler of the proc entry: binds the file to hwperf_proc_read()
 * through single_open(), with no private data, and returns its result.
 */
static int read_proc_open_hwperf(struct inode *inode, struct file *file) {
	int rc = single_open(file, hwperf_proc_read, NULL);

	return rc;
}

/*
 * write() handler of the "hwperf" proc entry.
 *
 * Accepted commands (first space-separated token of the written text):
 *   clear                  - zero all statistics
 *   type <event1>,<event0> - select the two events (decimal)
 *   mask <hex>             - set the control mask
 *   test                   - run three built-in self measurements
 *
 * The input is copied into a fixed-size stack buffer. The original code
 * used a variable-length array sized by the user-supplied @count and then
 * stored the terminator at proc_buffer[count], one byte past its end;
 * a large write could also exhaust the kernel stack. Input longer than
 * the buffer is now truncated; no valid command comes near that length.
 *
 * Returns @count (the whole write is consumed, also for unrecognised
 * input, where a usage message is logged instead), or -EFAULT if the user
 * buffer cannot be read.
 */
static ssize_t write_proc_hwperf(struct file *file, const char __user * userbuf, size_t count, loff_t * off) {	
	char proc_buffer[64];
	char *strptr;
	char *cmdptr;
	size_t len;

	/* Copy at most sizeof(proc_buffer) - 1 bytes, leaving room for NUL. */
	len = min_t(size_t, count, sizeof(proc_buffer) - 1);
	if ( copy_from_user(proc_buffer, userbuf, len) ) {
		return -EFAULT;
	}
	proc_buffer[len] = '\0';

	strptr = proc_buffer;
	if (proc_buffer[0] == '\0') {
		goto errout;
	}

	cmdptr = strsep(&strptr," ");
	if (cmdptr==NULL)
	{
		goto errout;
	}

	/*parse command*/
	if (strncmp(cmdptr, "clear",5) == 0) {
		hwperf_clear_all();
	}	
	else if (strncmp(cmdptr, "type",4) == 0) {
		int e0, e1;
		cmdptr = strsep(&strptr," ");
		if (cmdptr==NULL)
			goto errout;
		
		if (sscanf(cmdptr,"%d,%d",&e1,&e0)!=2)
			goto errout;
		
		perfevent = (e1&0xff)<<8 | (e0&0xff);		
		hwperf_ctrl_update(perfevent, perfmask);
	} 
	else if (strncmp(cmdptr, "mask",4) == 0) {
		/* %x stores into an unsigned int. */
		unsigned int mask;
		cmdptr = strsep(&strptr," ");
		if (cmdptr==NULL)
			goto errout;
		
		if (sscanf(cmdptr,"%x",&mask)!=1)
			goto errout;
		
		perfmask = mask;
		hwperf_ctrl_update(perfevent, perfmask);
	} 
	else if (strncmp(cmdptr, "test",4) == 0) {

		do { /* Test delay */
			printk("Test0 mdelay(100)\n");
			hwperf_start(0);
			mdelay(100);
			hwperf_stop(0);
		} while (0);
		
		do { /* Test D-cache access */
			void *kbuf;
			volatile u32 addr, val;
			int  iter, size;
			struct cpuinfo_mips *c = &current_cpu_data;
			
			printk("Test1 D-cache\n");
			/* Buffer the size of the whole data cache. */
			size = c->dcache.sets * c->dcache.ways * c->dcache.linesz;
			kbuf = kmalloc(size, GFP_KERNEL);
			if (kbuf) {
				for (iter=0; iter<1000; iter++) {
					dma_cache_inv((unsigned long)kbuf, size);
					addr = (u32)kbuf;
					hwperf_start(1);
					/* Read one word every 32 bytes across the buffer. */
					for (addr = (u32)kbuf; addr < ((u32)kbuf+size); addr += 32) 
						val = REG32(addr);
					hwperf_stop(1);
				}	
				kfree(kbuf);
			} else {
				printk("Fail to alloc %dkB memory", size >> 10);
			}
		} while (0);
		
		do { /* Back-to-back start/stop: overhead of the helpers */
			printk("Test2 hwperf baseline\n");
			hwperf_start(2);			
			hwperf_stop(2);
		} while (0);
	} 
	else goto errout;
	
	return count;	
errout:	
	printk("Invalid args!\n\n");	
	printk("Usage:\n");
	printk("echo clear : reset hardware counter\n");
	printk("echo type <event1>,<event0> : set perf type\n");
	
	/* Consume the input anyway so the writer does not retry it. */
	return count;	
}

/*
 * File operations of the "hwperf" proc entry: reads go through the
 * seq_file single_open() helpers, writes through the command parser.
 */
static struct file_operations fops_hwperf = {
	/* read side (seq_file) */
	.open    = read_proc_open_hwperf,
	.release = single_release,
	.read    = seq_read,
	.llseek  = seq_lseek,
	/* write side (command interface) */
	.write   = write_proc_hwperf,
};

/*
 * Module init: set the default configuration (event word 0x100, mask
 * 0x6), zero all statistics and create the "hwperf" proc entry below
 * realtek_proc.
 *
 * Returns 0 on success or -ENOMEM if the proc entry cannot be created;
 * the original ignored that failure and reported success.
 */
static int __init hwperf_init(void)
{
	extern struct proc_dir_entry *realtek_proc;
	
	perfevent = 0x100;
	perfmask  = 0x6;
	hwperf_ctrl_update(perfevent, perfmask);
	hwperf_clear_all();
	if (!proc_create_data("hwperf", 0644, realtek_proc, &fops_hwperf, NULL)) {
		printk(KERN_ERR "hwperf: failed to create proc entry\n");
		return -ENOMEM;
	}
	return 0;
}

module_init(hwperf_init);