Linux内核线程与临界区内的线程切换错误

2016-09-12 #kernel分析

此处讨论的内核版本是linux-2.6.19 for mips-32。

此内核为分时系统，调度时间为1s。

1. 内核线程 exp1

/* Counter shared by both test threads; each thread prints and increments it without locking. */
uint32 index = 0;

/*
 * Thread body A (exp1): endlessly prints the shared counter, then bumps it.
 *
 * @data: unused.
 *
 * The loop never terminates, so the trailing return is never reached.
 */
int test_thread_A(void *data)
{
	for (;;) {
		printk("A: %u\n", index);
		++index;
	}

	return 1;
}

/*
 * Thread body B (exp1): endlessly prints the shared counter, then bumps it.
 *
 * @data: unused.
 *
 * The loop never terminates, so the trailing return is never reached.
 */
int test_thread_B(void *data)
{
	for (;;) {
		printk("B: %u\n", index);
		++index;
	}

	return 1;
}

/*
 * Create the two demo kernel threads and start them.
 *
 * Both threads are created first and only woken afterwards, so neither
 * starts running until both exist. Both are named "test_task".
 *
 * If creating the second thread fails, the first one (created but not yet
 * woken) is released with kthread_stop() rather than being left behind.
 */
void Create_My_Test_Thread(void)
{
	struct task_struct *test_task_A, *test_task_B;

	test_task_A = kthread_create(test_thread_A, NULL, "test_task");
	if (IS_ERR(test_task_A)) {
		printk("Unable to start kernel thread.\n");
		return;
	}

	test_task_B = kthread_create(test_thread_B, NULL, "test_task");
	if (IS_ERR(test_task_B)) {
		printk("Unable to start kernel thread.\n");
		/* Thread A has not been woken yet; stop it so it does not leak. */
		kthread_stop(test_task_A);
		return;
	}

	wake_up_process(test_task_A);
	wake_up_process(test_task_B);
}

结果：

......
A:0
A:1
......
B:1578
B:1579
......
A:3679
A:3680
......

两个内核线程定时被调度，一切正常。

2. 内核线程 exp2

/* Counter shared by both test threads; each thread prints and increments it without locking. */
uint32 index = 0;

int test_thread_A(void *data)
{
	uint32 idx;
	
	while(1)
	{
		for(idx = 0; idx < 5; idx++)
		{
			printk("A-%u: %u\n", idx, index);
			index++;
			sleep(1);
		}
	}
	
	return 1;
}

int test_thread_B(void *data)
{
	uint32 idx;
	
	while(1)
	{
		for(idx = 0; idx < 5; idx++)
		{
			printk("B-%u: %u\n", idx, index);
			index++;
			sleep(1);
		}
	}
	
	return 1;
}

/*
 * Create the two demo kernel threads and start them.
 *
 * Both threads are created first and only woken afterwards, so neither
 * starts running until both exist. Both are named "test_task".
 *
 * If creating the second thread fails, the first one (created but not yet
 * woken) is released with kthread_stop() rather than being left behind.
 */
void Create_My_Test_Thread(void)
{
	struct task_struct *test_task_A, *test_task_B;

	test_task_A = kthread_create(test_thread_A, NULL, "test_task");
	if (IS_ERR(test_task_A)) {
		printk("Unable to start kernel thread.\n");
		return;
	}

	test_task_B = kthread_create(test_thread_B, NULL, "test_task");
	if (IS_ERR(test_task_B)) {
		printk("Unable to start kernel thread.\n");
		/* Thread A has not been woken yet; stop it so it does not leak. */
		kthread_stop(test_task_A);
		return;
	}

	wake_up_process(test_task_A);
	wake_up_process(test_task_B);
}

结果：

......
A-0: 3343
B-0: 3344
......
A-2: 3345
B-2: 3346
......
A-4: 3347
B-4: 3348
......

现在加入sleep函数。由于sleep会让内核线程进入睡眠，线程会主动调用schedule()让出CPU，因此两个线程交替执行。

3. 内核线程 exp3

现在我希望在每个线程执行for循环的时候不允许被打断，因此我需要给这段加锁，以防止线程被切换。

我选择的是spinlock：锁被占用时，等待方原地忙等而不会睡眠；持锁期间内核抢占被关闭，因此临界区内的代码不会被其他线程打断。

也就是说临界区代码是原子操作。

#include <linux/spinlock.h>
/*
 * Lock taken by both threads around their five-step print loop.
 * DEFINE_SPINLOCK() declares and statically initialises the lock. A
 * spin_lock_init() call cannot appear at file scope, and it is not needed
 * for a statically initialised lock.
 */
DEFINE_SPINLOCK(spinlock);

/* Counter shared by both test threads. */
uint32 index = 0;

/*
 * Thread body A (exp3): takes the spinlock, then five times prints the step
 * number and the shared counter, increments the counter and sleeps.
 *
 * @data: unused.
 *
 * This is the deliberately broken variant: it sleeps while holding a
 * spinlock, which produces the "BUG: scheduling while atomic" report shown
 * in the recorded output. The loop never terminates, so the trailing return
 * is never reached.
 */
int test_thread_A(void *data)
{
	uint32 idx;

	while (1) {
		/* spin_lock()/spin_unlock() take a pointer to the lock. */
		spin_lock(&spinlock);

		for (idx = 0; idx < 5; idx++) {
			printk("A-%u: %u\n", idx, index);
			index++;
			/*
			 * Sleeping call, matching the recorded call trace
			 * (osal_time_sleep -> ... -> schedule). Intentionally wrong
			 * inside the critical section.
			 * NOTE(review): argument assumed to be in seconds, like
			 * sleep(1) in exp2 -- confirm against osal_time_sleep().
			 */
			osal_time_sleep(1);
		}

		spin_unlock(&spinlock);
	}

	return 1;
}

/*
 * Thread body B (exp3): takes the spinlock, then five times prints the step
 * number and the shared counter, increments the counter and sleeps.
 *
 * @data: unused.
 *
 * This is the deliberately broken variant: it sleeps while holding a
 * spinlock, which produces the "BUG: scheduling while atomic" report shown
 * in the recorded output. The loop never terminates, so the trailing return
 * is never reached.
 */
int test_thread_B(void *data)
{
	uint32 idx;

	while (1) {
		/* spin_lock()/spin_unlock() take a pointer to the lock. */
		spin_lock(&spinlock);

		for (idx = 0; idx < 5; idx++) {
			printk("B-%u: %u\n", idx, index);
			index++;
			/*
			 * Sleeping call, matching the recorded call trace
			 * (osal_time_sleep -> ... -> schedule). Intentionally wrong
			 * inside the critical section.
			 * NOTE(review): argument assumed to be in seconds, like
			 * sleep(1) in exp2 -- confirm against osal_time_sleep().
			 */
			osal_time_sleep(1);
		}

		spin_unlock(&spinlock);
	}

	return 1;
}

/*
 * Create the two demo kernel threads and start them.
 *
 * Both threads are created first and only woken afterwards, so neither
 * starts running until both exist. Both are named "test_task".
 *
 * If creating the second thread fails, the first one (created but not yet
 * woken) is released with kthread_stop() rather than being left behind.
 */
void Create_My_Test_Thread(void)
{
	struct task_struct *test_task_A, *test_task_B;

	test_task_A = kthread_create(test_thread_A, NULL, "test_task");
	if (IS_ERR(test_task_A)) {
		printk("Unable to start kernel thread.\n");
		return;
	}

	test_task_B = kthread_create(test_thread_B, NULL, "test_task");
	if (IS_ERR(test_task_B)) {
		printk("Unable to start kernel thread.\n");
		/* Thread A has not been woken yet; stop it so it does not leak. */
		kthread_stop(test_task_A);
		return;
	}

	wake_up_process(test_task_A);
	wake_up_process(test_task_B);
}

结果：

......
A-0: 210
BUG: scheduling while atomic: test_task/0x00000001/78
Call Trace:
[<80009038>] dump_stack+0x8/0x34
[<801fd238>] schedule+0x6c/0xc08
[<801fefc0>] schedule_timeout+0xa8/0xe0
[<801fe830>] interruptible_sleep_on_timeout+0xa8/0x15c
[<c00ce524>] osal_time_sleep+0x50/0x9c [rtcore]
[<c00b4334>] test_thread_A+0x64/0xa8 [custom_poe]
[<800434bc>] kthread+0x1c4/0x210
[<800042b8>] kernel_thread_helper+0x10/0x18

B-0: 211
BUG: scheduling while atomic: test_task/0x00000001/79
Call Trace:
[<80009038>] dump_stack+0x8/0x34
[<801fd238>] schedule+0x6c/0xc08
[<801fefc0>] schedule_timeout+0xa8/0xe0
[<801fe830>] interruptible_sleep_on_timeout+0xa8/0x15c
[<c00ce524>] osal_time_sleep+0x50/0x9c [rtcore]
[<c00b43dc>] test_thread_B+0x64/0xa8 [custom_poe]
[<800434bc>] kthread+0x1c4/0x210
[<800042b8>] kernel_thread_helper+0x10/0x18

A-1: 212
BUG: scheduling while atomic: test_task/0x00000001/78
Call Trace:
[<80009038>] dump_stack+0x8/0x34
[<801fd238>] schedule+0x6c/0xc08
[<801fefc0>] schedule_timeout+0xa8/0xe0
[<801fe830>] interruptible_sleep_on_timeout+0xa8/0x15c
[<c00ce524>] osal_time_sleep+0x50/0x9c [rtcore]
[<c00b4334>] test_thread_A+0x64/0xa8 [custom_poe]
[<800434bc>] kthread+0x1c4/0x210
[<800042b8>] kernel_thread_helper+0x10/0x18

B-1: 213
BUG: scheduling while atomic: test_task/0x00000001/79
Call Trace:
[<80009038>] dump_stack+0x8/0x34
[<801fd238>] schedule+0x6c/0xc08
[<801fefc0>] schedule_timeout+0xa8/0xe0
[<801fe830>] interruptible_sleep_on_timeout+0xa8/0x15c
[<c00ce524>] osal_time_sleep+0x50/0x9c [rtcore]
[<c00b43dc>] test_thread_B+0x64/0xa8 [custom_poe]
[<800434bc>] kthread+0x1c4/0x210
[<800042b8>] kernel_thread_helper+0x10/0x18

A-2: 214
BUG: scheduling while atomic: test_task/0x00000001/78
Call Trace:
[<80009038>] dump_stack+0x8/0x34
[<801fd238>] schedule+0x6c/0xc08
[<801fefc0>] schedule_timeout+0xa8/0xe0
[<801fe830>] interruptible_sleep_on_timeout+0xa8/0x15c
[<c00ce524>] osal_time_sleep+0x50/0x9c [rtcore]
[<c00b4334>] test_thread_A+0x64/0xa8 [custom_poe]
[<800434bc>] kthread+0x1c4/0x210
[<800042b8>] kernel_thread_helper+0x10/0x18

B-2: 215
BUG: scheduling while atomic: test_task/0x00000001/79
Call Trace:
[<80009038>] dump_stack+0x8/0x34
[<801fd238>] schedule+0x6c/0xc08
[<801fefc0>] schedule_timeout+0xa8/0xe0
[<801fe830>] interruptible_sleep_on_timeout+0xa8/0x15c
[<c00ce524>] osal_time_sleep+0x50/0x9c [rtcore]
[<c00b43dc>] test_thread_B+0x64/0xa8 [custom_poe]
[<800434bc>] kthread+0x1c4/0x210
[<800042b8>] kernel_thread_helper+0x10/0x18
......

以下转载自《BUG: scheduling while atomic 分析》：

linux内核打印“BUG: scheduling while atomic”和“bad: scheduling from the idle thread”错误的时候，通常是在中断处理函数中调用了可能休眠的函数，如semaphore、mutex、sleep等。

而linux内核要求在中断处理的时候，不允许系统调度，不允许抢占，要等到中断处理完成才能做其他事情。因此，要充分考虑中断处理的时间，一定不能太久。

另外一个能产生此问题的是在idle进程里面，做了不该做的事情。现在Linux用于很多手持式设备，为了降低功耗，通常的做法是在idle进程里面降低CPU或RAM的频率、关闭一些设备等等。要保证这些动作的原子性才能确保不发生“bad: scheduling from the idle thread”这样的错误！

禁止内核抢占是指内核不会主动抢占你的进程，但如果是你在自己的程序中主动调用schedule()，kernel并不能阻止你这么做。

持有spinlock的临界区与中断处理函数类似，都不允许线程切换：不管是被内核抢占还是线程自愿让出CPU，只要在其中调用了schedule()，内核就会报错。sleep函数恰好会主动调用schedule()，因此导致了上面的错误。

4. 内核线程 exp4

#include <linux/spinlock.h>
/*
 * Lock taken by both threads around their five-step print loop.
 * DEFINE_SPINLOCK() declares and statically initialises the lock. A
 * spin_lock_init() call cannot appear at file scope, and it is not needed
 * for a statically initialised lock.
 */
DEFINE_SPINLOCK(spinlock);

/* Counter shared by both test threads. */
uint32 index = 0;

/*
 * Thread body A (exp4): takes the spinlock, then five times prints the step
 * number and the shared counter, increments the counter and delays.
 *
 * @data: unused.
 *
 * The recorded output for this variant shows each thread's five steps
 * running back to back with no BUG report.
 * NOTE(review): osal_time_mdelay() is presumably a busy-wait delay in
 * milliseconds that never sleeps -- confirm against its definition.
 * Holding a spinlock across such a long delay is acceptable only in a demo.
 * The loop never terminates, so the trailing return is never reached.
 */
int test_thread_A(void *data)
{
	uint32 idx;

	while (1) {
		/* spin_lock()/spin_unlock() take a pointer to the lock. */
		spin_lock(&spinlock);

		for (idx = 0; idx < 5; idx++) {
			printk("A-%u: %u\n", idx, index);
			index++;
			osal_time_mdelay(1000);
		}

		spin_unlock(&spinlock);
	}

	return 1;
}

/*
 * Thread body B (exp4): takes the spinlock, then five times prints the step
 * number and the shared counter, increments the counter and delays.
 *
 * @data: unused.
 *
 * The recorded output for this variant shows each thread's five steps
 * running back to back with no BUG report.
 * NOTE(review): osal_time_mdelay() is presumably a busy-wait delay in
 * milliseconds that never sleeps -- confirm against its definition.
 * Holding a spinlock across such a long delay is acceptable only in a demo.
 * The loop never terminates, so the trailing return is never reached.
 */
int test_thread_B(void *data)
{
	uint32 idx;

	while (1) {
		/* spin_lock()/spin_unlock() take a pointer to the lock. */
		spin_lock(&spinlock);

		for (idx = 0; idx < 5; idx++) {
			printk("B-%u: %u\n", idx, index);
			index++;
			osal_time_mdelay(1000);
		}

		spin_unlock(&spinlock);
	}

	return 1;
}

/*
 * Create the two demo kernel threads and start them.
 *
 * Both threads are created first and only woken afterwards, so neither
 * starts running until both exist. Both are named "test_task".
 *
 * If creating the second thread fails, the first one (created but not yet
 * woken) is released with kthread_stop() rather than being left behind.
 */
void Create_My_Test_Thread(void)
{
	struct task_struct *test_task_A, *test_task_B;

	test_task_A = kthread_create(test_thread_A, NULL, "test_task");
	if (IS_ERR(test_task_A)) {
		printk("Unable to start kernel thread.\n");
		return;
	}

	test_task_B = kthread_create(test_thread_B, NULL, "test_task");
	if (IS_ERR(test_task_B)) {
		printk("Unable to start kernel thread.\n");
		/* Thread A has not been woken yet; stop it so it does not leak. */
		kthread_stop(test_task_A);
		return;
	}

	wake_up_process(test_task_A);
	wake_up_process(test_task_B);
}

结果：

......
A-0: 3980
A-1: 3981
A-2: 3982
A-3: 3983
A-4: 3984
B-0: 3985
B-1: 3986
B-2: 3987
B-3: 3988
B-4: 3989
A-0: 3990
A-1: 3991
A-2: 3992
A-3: 3993
A-4: 3994
......

这里的延时用的是忙等的osal_time_mdelay，不会睡眠，也就不会调用schedule()。临界区内既没有内核抢占也没有自愿的线程切换，执行正确。