/*
 * BK Id: SCCS/s.smp.c 1.37 11/23/01 16:38:30 paulus
 */
/*
 * SMP support for ppc.
 *
 * Written by Cort Dougan (cort@cs.nmt.edu) borrowing a great
 * deal of code from the sparc and intel versions.
 *
 * Copyright (C) 1999 Cort Dougan
 */

/* the include list below is reconstructed; the original header names
 * were lost from this copy */
#include <linux/config.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/smp.h>
#include <linux/smp_lock.h>
#include <linux/interrupt.h>
#include <linux/kernel_stat.h>
#include <linux/delay.h>
#define __KERNEL_SYSCALLS__
#include <linux/unistd.h>
#include <linux/init.h>
#include <linux/spinlock.h>
#include <linux/cache.h>

#include <asm/ptrace.h>
#include <asm/atomic.h>
#include <asm/irq.h>
#include <asm/page.h>
#include <asm/pgtable.h>
#include <asm/hardirq.h>
#include <asm/softirq.h>
#include <asm/io.h>
#include <asm/prom.h>
#include <asm/smp.h>
#include <asm/residual.h>
#include <asm/time.h>

int smp_threads_ready;
volatile int smp_commenced;
int smp_num_cpus = 1;
int smp_tb_synchronized;
struct cpuinfo_PPC cpu_data[NR_CPUS];
struct klock_info_struct klock_info = { KLOCK_CLEAR, 0 };
atomic_t ipi_recv;
atomic_t ipi_sent;
spinlock_t kernel_flag __cacheline_aligned_in_smp = SPIN_LOCK_UNLOCKED;
unsigned int prof_multiplier[NR_CPUS];
unsigned int prof_counter[NR_CPUS];
cycles_t cacheflush_time;
static int max_cpus __initdata = NR_CPUS;
unsigned long cpu_online_map;
int smp_hw_index[NR_CPUS];
static struct smp_ops_t *smp_ops;

/* all cpu mappings are 1-1 -- Cort */
volatile unsigned long cpu_callin_map[NR_CPUS];

#define TB_SYNC_PASSES 4
volatile unsigned long __initdata tb_sync_flag = 0;
volatile unsigned long __initdata tb_offset = 0;

int start_secondary(void *);
extern int cpu_idle(void *unused);
void smp_call_function_interrupt(void);
void smp_message_pass(int target, int msg, unsigned long data, int wait);

/*
 * Since OpenPIC has only 4 IPIs, we use slightly different message numbers.
 *
 * Make sure this matches openpic_request_IPIs in open_pic.c, or what shows up
 * in /proc/interrupts will be wrong!!! --Troy
 */
#define PPC_MSG_CALL_FUNCTION	0
#define PPC_MSG_RESCHEDULE	1
#define PPC_MSG_INVALIDATE_TLB	2
#define PPC_MSG_XMON_BREAK	3

#define smp_message_pass(t,m,d,w) \
	do { \
		if (smp_ops) { \
			atomic_inc(&ipi_sent); \
			smp_ops->message_pass((t),(m),(d),(w)); \
		} \
	} while (0)

/*
 * Common functions
 */
void smp_local_timer_interrupt(struct pt_regs * regs)
{
	int cpu = smp_processor_id();

	if (!--prof_counter[cpu]) {
		update_process_times(user_mode(regs));
		prof_counter[cpu] = prof_multiplier[cpu];
	}
}

void smp_message_recv(int msg, struct pt_regs *regs)
{
	atomic_inc(&ipi_recv);

	switch (msg) {
	case PPC_MSG_CALL_FUNCTION:
		smp_call_function_interrupt();
		break;
	case PPC_MSG_RESCHEDULE:
		current->need_resched = 1;
		break;
	case PPC_MSG_INVALIDATE_TLB:
		_tlbia();
		break;
#ifdef CONFIG_XMON
	case PPC_MSG_XMON_BREAK:
		xmon(regs);
		break;
#endif /* CONFIG_XMON */
	default:
		printk("SMP %d: smp_message_recv(): unknown msg %d\n",
		       smp_processor_id(), msg);
		break;
	}
}

/*
 * 750s don't broadcast tlb invalidates, so
 * we have to emulate that behavior.
 *   -- Cort
 */
void smp_send_tlb_invalidate(int cpu)
{
	if (PVR_VER(mfspr(PVR)) == 8)
		smp_message_pass(MSG_ALL_BUT_SELF, PPC_MSG_INVALIDATE_TLB, 0, 0);
}

void smp_send_reschedule(int cpu)
{
	/*
	 * This is only used if `cpu' is running an idle task,
	 * so it will reschedule itself anyway...
	 *
	 * This isn't the case anymore since the other CPU could be
	 * sleeping and won't reschedule until the next interrupt (such
	 * as the timer).
	 *   -- Cort
	 */
	smp_message_pass(cpu, PPC_MSG_RESCHEDULE, 0, 0);
}
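/*
 * For reference, a reschedule IPI round trip through the code above
 * (a reading aid, not a new code path -- the platform IPI handler name
 * varies by board and is assumed here):
 *
 *	cpu A: smp_send_reschedule(B)
 *	         -> smp_message_pass(B, PPC_MSG_RESCHEDULE, 0, 0)
 *	         -> smp_ops->message_pass() raises the IPI on cpu B
 *	cpu B: platform IPI handler
 *	         -> smp_message_recv(PPC_MSG_RESCHEDULE, regs)
 *	         -> sets current->need_resched = 1; cpu B reschedules
 *	            on return from the interrupt
 */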
#ifdef CONFIG_XMON
void smp_send_xmon_break(int cpu)
{
	smp_message_pass(cpu, PPC_MSG_XMON_BREAK, 0, 0);
}
#endif /* CONFIG_XMON */

static void stop_this_cpu(void *dummy)
{
	__cli();
	while (1)
		;
}

void smp_send_stop(void)
{
	smp_call_function(stop_this_cpu, NULL, 1, 0);
	smp_num_cpus = 1;
}

/*
 * Structure and data for smp_call_function(). This is designed to minimise
 * static memory requirements. It also looks cleaner.
 * Stolen from the i386 version.
 */
static spinlock_t call_lock = SPIN_LOCK_UNLOCKED;

static struct call_data_struct {
	void (*func) (void *info);
	void *info;
	atomic_t started;
	atomic_t finished;
	int wait;
} *call_data;

/*
 * This function sends a 'generic call function' IPI to all other CPUs
 * in the system.
 */
int smp_call_function(void (*func) (void *info), void *info, int nonatomic,
		      int wait)
/*
 * [SUMMARY] Run a function on all other CPUs.
 * <func> The function to run. This must be fast and non-blocking.
 * <info> An arbitrary pointer to pass to the function.
 * <nonatomic> currently unused.
 * <wait> If true, wait (atomically) until function has completed on other CPUs.
 * [RETURNS] 0 on success, else a negative status code. Does not return until
 * remote CPUs are nearly ready to execute <<func>> or have executed it.
 *
 * You must not call this function with disabled interrupts or from a
 * hardware interrupt handler; you may call it from a bottom half handler.
 */
{
	struct call_data_struct data;
	int ret = -1, cpus = smp_num_cpus - 1;
	int timeout;

	if (!cpus)
		return 0;

	data.func = func;
	data.info = info;
	atomic_set(&data.started, 0);
	data.wait = wait;
	if (wait)
		atomic_set(&data.finished, 0);

	spin_lock_bh(&call_lock);
	call_data = &data;
	/* Send a message to all other CPUs and wait for them to respond */
	smp_message_pass(MSG_ALL_BUT_SELF, PPC_MSG_CALL_FUNCTION, 0, 0);

	/* Wait for response */
	timeout = 1000000;
	while (atomic_read(&data.started) != cpus) {
		if (--timeout == 0) {
			printk("smp_call_function on cpu %d: other cpus not responding (%d)\n",
			       smp_processor_id(), atomic_read(&data.started));
			goto out;
		}
		barrier();
		udelay(1);
	}

	if (wait) {
		timeout = 1000000;
		while (atomic_read(&data.finished) != cpus) {
			if (--timeout == 0) {
				printk("smp_call_function on cpu %d: other cpus not finishing (%d/%d)\n",
				       smp_processor_id(),
				       atomic_read(&data.finished),
				       atomic_read(&data.started));
				goto out;
			}
			barrier();
			udelay(1);
		}
	}
	ret = 0;

 out:
	spin_unlock_bh(&call_lock);
	return ret;
}

void smp_call_function_interrupt(void)
{
	void (*func) (void *info) = call_data->func;
	void *info = call_data->info;
	int wait = call_data->wait;

	/*
	 * Notify initiating CPU that I've grabbed the data and am
	 * about to execute the function
	 */
	atomic_inc(&call_data->started);
	/*
	 * At this point the info structure may be out of scope unless wait==1
	 */
	(*func)(info);
	if (wait)
		atomic_inc(&call_data->finished);
}
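/*
 * Example use of smp_call_function(), modelled on smp_send_stop() above.
 * A hedged sketch only -- do_flush() is hypothetical, not part of this
 * file:
 *
 *	static void do_flush(void *info)
 *	{
 *		_tlbia();	// must be fast and non-blocking
 *	}
 *	...
 *	smp_call_function(do_flush, NULL, 1, 1);  // wait==1: returns once
 *	do_flush(NULL);				  // all other cpus finish
 *
 * Note that smp_call_function() only reaches the *other* cpus, so the
 * caller must also run func locally if it wants all-cpu coverage.
 */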
void __init smp_boot_cpus(void)
{
	extern struct task_struct *current_set[NR_CPUS];
	int i, cpu_nr;
	struct task_struct *p;

	printk("Entering SMP Mode...\n");
	smp_num_cpus = 1;
	smp_store_cpu_info(0);
	cpu_online_map = 1UL;

	/*
	 * Assume for now that the first cpu booted is
	 * cpu 0, the master -- Cort
	 */
	cpu_callin_map[0] = 1;
	current->processor = 0;

	init_idle();

	for (i = 0; i < NR_CPUS; i++) {
		prof_counter[i] = 1;
		prof_multiplier[i] = 1;
	}

	/*
	 * XXX very rough, assumes 20 bus cycles to read a cache line,
	 * timebase increments every 4 bus cycles, 32kB L1 data cache.
	 * That is (20 / 4) = 5 timebase ticks per line, times
	 * (32768 / 32) = 1024 lines.
	 */
	cacheflush_time = 5 * 1024;

	smp_ops = ppc_md.smp_ops;
	if (smp_ops == NULL) {
		printk("SMP not supported on this machine.\n");
		return;
	}

	/* Probe arch for CPUs */
	cpu_nr = smp_ops->probe();

	/*
	 * Only check for cpus we know exist. We keep the callin map
	 * with cpus at the bottom -- Cort
	 */
	if (cpu_nr > max_cpus)
		cpu_nr = max_cpus;
	for (i = 1; i < cpu_nr; i++) {
		int c;
		struct pt_regs regs;

		/* create a process for the processor */
		/* only regs.msr is actually used, and 0 is OK for it */
		memset(&regs, 0, sizeof(struct pt_regs));
		if (do_fork(CLONE_VM|CLONE_PID, 0, &regs, 0) < 0)
			panic("failed fork for CPU %d", i);
		p = init_task.prev_task;
		if (!p)
			panic("No idle task for CPU %d", i);
		del_from_runqueue(p);
		unhash_process(p);
		init_tasks[i] = p;

		p->processor = i;
		p->cpus_runnable = 1 << i; /* we schedule the first task manually */
		current_set[i] = p;

		/*
		 * There was a cache flush loop here to flush the cache
		 * to memory for the first 8MB of RAM. The cache flush
		 * has been pushed into the kick_cpu function for those
		 * platforms that need it.
		 */

		/* wake up cpus */
		smp_ops->kick_cpu(i);

		/*
		 * Wait to see if the cpu made a callin (is actually up).
		 * Use this value that I found through experimentation.
		 *   -- Cort
		 */
		for (c = 1000; c && !cpu_callin_map[i]; c--)
			udelay(100);

		if (cpu_callin_map[i]) {
			char buf[32];
			sprintf(buf, "found cpu %d", i);
			if (ppc_md.progress)
				ppc_md.progress(buf, 0x350+i);
			printk("Processor %d found.\n", i);
			smp_num_cpus++;
		} else {
			char buf[32];
			sprintf(buf, "didn't find cpu %d", i);
			if (ppc_md.progress)
				ppc_md.progress(buf, 0x360+i);
			printk("Processor %d is stuck.\n", i);
		}
	}

	/* Setup CPU 0 last (important) */
	smp_ops->setup_cpu(0);

	if (smp_num_cpus < 2)
		smp_tb_synchronized = 1;
}

void __init smp_software_tb_sync(int cpu)
{
#define PASSES 4	/* 4 passes.. */
	int pass;
	int i, j;

	/*
	 * stop - start will be the number of timebase ticks it takes for cpu0
	 * to send a message to all others and the first response to show up.
	 *
	 * ASSUMPTION: this time is similar for all cpus
	 * ASSUMPTION: the time to send a one-way message is ping/2
	 */
	register unsigned long start = 0;
	register unsigned long stop = 0;
	register unsigned long temp = 0;

	set_tb(0, 0);

	/* multiple passes to get in l1 cache.. */
	for (pass = 2; pass < 2+PASSES; pass++) {
		if (cpu == 0) {
			mb();
			for (i = j = 1; i < smp_num_cpus; i++, j++) {
				/* skip stuck cpus */
				while (!cpu_callin_map[j])
					++j;
				while (cpu_callin_map[j] != pass)
					barrier();
			}
			mb();
			tb_sync_flag = pass;
			start = get_tbl();	/* start timing */
			while (tb_sync_flag)
				mb();
			stop = get_tbl();	/* end timing */
			/*
			 * Theoretically, the divisor should be 2, but
			 * I get better results on my dual mtx. Someone
			 * please report results on other smp machines..
			 */
			tb_offset = (stop-start)/4;
			mb();
			tb_sync_flag = pass;
			udelay(10);
			mb();
			tb_sync_flag = 0;
			mb();
			set_tb(0,0);
			mb();
		} else {
			cpu_callin_map[cpu] = pass;
			mb();
			while (!tb_sync_flag)
				mb();		/* wait for cpu0 */
			mb();
			tb_sync_flag = 0;	/* send response for timing */
			mb();
			while (!tb_sync_flag)
				mb();
			temp = tb_offset;	/* make sure offset is loaded */
			while (tb_sync_flag)
				mb();
			set_tb(0,temp);		/* now, set the timebase */
			mb();
		}
	}

	if (cpu == 0) {
		smp_tb_synchronized = 1;
		printk("smp_software_tb_sync: %d passes, final offset: %ld\n",
		       PASSES, tb_offset);
	}
	/* so time.c doesn't get confused */
	set_dec(tb_ticks_per_jiffy);
	last_jiffy_stamp(cpu) = 0;
}
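/*
 * The per-pass handshake in smp_software_tb_sync(), laid out side by
 * side (a reading aid only; the steps are exactly the ones above):
 *
 *	cpu0				secondary
 *	----				---------
 *	wait for callin_map == pass	callin_map = pass
 *	tb_sync_flag = pass; read tb	wait for tb_sync_flag != 0
 *	wait for tb_sync_flag == 0	tb_sync_flag = 0 (the "ping")
 *	read tb; offset = delta/4	wait for tb_sync_flag != 0
 *	tb_sync_flag = pass		read tb_offset
 *	tb_sync_flag = 0; set_tb(0,0)	wait for tb_sync_flag == 0,
 *					then set_tb(0, offset)
 *
 * With the stated ping/2 assumption the divisor would be 2; the /4 is
 * the empirical tweak noted in the comment above.
 */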
void __init smp_commence(void)
{
	/*
	 * Lets the callins below out of their loop.
	 */
	if (ppc_md.progress)
		ppc_md.progress("smp_commence", 0x370);
	wmb();
	smp_commenced = 1;

	/*
	 * If the smp_ops->setup_cpu function has not already synched the
	 * timebases with a nicer hardware-based method, do so now.
	 *
	 * I am open to suggestions for improvements to this method
	 *   -- Troy
	 *
	 * NOTE: if you are debugging, set smp_tb_synchronized for now,
	 * since this code runs pretty early and needs all cpus that
	 * reported in via cpu_callin_map to be working.
	 *
	 * NOTE2: this code doesn't seem to work on > 2 cpus. -- paulus/BenH
	 */
	if (!smp_tb_synchronized && smp_num_cpus == 2) {
		unsigned long flags;
		__save_and_cli(flags);
		smp_software_tb_sync(0);
		__restore_flags(flags);
	}
}

void __init smp_callin(void)
{
	int cpu = current->processor;

	smp_store_cpu_info(cpu);
	set_dec(tb_ticks_per_jiffy);
	cpu_callin_map[cpu] = 1;

	smp_ops->setup_cpu(cpu);

	/*
	 * This cpu is now "online". Only set it online
	 * before it enters the loop below, since write access
	 * to the variable below is _not_ guaranteed to be
	 * atomic.
	 *   -- Cort
	 */
	cpu_online_map |= 1UL << smp_processor_id();

	while (!smp_commenced)
		barrier();

	/* see smp_commence for more info */
	if (!smp_tb_synchronized && smp_num_cpus == 2)
		smp_software_tb_sync(cpu);

	__sti();
}

/* intel needs this */
void __init initialize_secondary(void)
{
}

/* Activate a secondary processor. */
int __init start_secondary(void *unused)
{
	atomic_inc(&init_mm.mm_count);
	current->active_mm = &init_mm;
	smp_callin();
	return cpu_idle(NULL);
}

void __init smp_setup(char *str, int *ints)
{
}

int __init setup_profiling_timer(unsigned int multiplier)
{
	return 0;
}

void __init smp_store_cpu_info(int id)
{
	struct cpuinfo_PPC *c = &cpu_data[id];

	/* assume bogomips are same for everything */
	c->loops_per_jiffy = loops_per_jiffy;
	c->pvr = mfspr(PVR);
}

static int __init maxcpus(char *str)
{
	get_option(&str, &max_cpus);
	return 1;
}

__setup("maxcpus=", maxcpus);
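/*
 * For platform porters: everything in this file drives SMP through the
 * four smp_ops hooks it calls (probe, kick_cpu, setup_cpu, message_pass),
 * installed via ppc_md.smp_ops before smp_boot_cpus() runs. A minimal
 * sketch -- the myboard_* names are hypothetical and the struct layout
 * should be taken from <asm/smp.h>, not from this comment:
 *
 *	static struct smp_ops_t myboard_smp_ops = {
 *		.message_pass	= myboard_message_pass,
 *		.probe		= myboard_probe,	// returns cpu count
 *		.kick_cpu	= myboard_kick_cpu,	// start secondary
 *		.setup_cpu	= myboard_setup_cpu,	// per-cpu setup
 *	};
 *	...
 *	ppc_md.smp_ops = &myboard_smp_ops;
 *
 * message_pass() must deliver IPIs that end up in smp_message_recv().
 */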