diff -urN linux-2.6.3.orig/CREDITS linux-2.6.3/CREDITS --- linux-2.6.3.orig/CREDITS Thu Feb 19 01:26:06 2004 +++ linux-2.6.3/CREDITS Sun Feb 22 00:34:22 2004 @@ -2012,6 +2012,10 @@ S: Halifax, Nova Scotia S: Canada B3J 3C8 +N: Toshiyuki Maeda +E: tosh@is.s.u-tokyo.ac.jp +D: Kernel Mode Linux + N: Kai Mäkisara E: Kai.Makisara@kolumbus.fi D: SCSI Tape Driver diff -urN linux-2.6.3.orig/Documentation/00-INDEX linux-2.6.3/Documentation/00-INDEX --- linux-2.6.3.orig/Documentation/00-INDEX Thu Feb 19 01:25:59 2004 +++ linux-2.6.3/Documentation/00-INDEX Sun Feb 22 00:34:22 2004 @@ -116,6 +116,8 @@ - listing of various WWW + books that document kernel internals. kernel-parameters.txt - summary listing of command line / boot prompt args for the kernel. +kml.txt + - info on Kernel Mode Linux. ldm.txt - a brief description of LDM (Windows Dynamic Disks). locks.txt diff -urN linux-2.6.3.orig/Documentation/kml.txt linux-2.6.3/Documentation/kml.txt --- linux-2.6.3.orig/Documentation/kml.txt Thu Jan 1 09:00:00 1970 +++ linux-2.6.3/Documentation/kml.txt Sun Feb 22 00:34:22 2004 @@ -0,0 +1,93 @@ +Kernel Mode Linux (http://web.yl.is.s.u-tokyo.ac.jp/~tosh/kml) +Toshiyuki Maeda + + +Introduction: + +Kernel Mode Linux is a technology which enables us to execute user programs +in a kernel mode. In Kernel Mode Linux, user programs can be executed as +user processes that have the privilege level of a kernel mode. +The benefit of executing user programs in a kernel mode +is that the user programs can access a kernel address space directly. +So, for example, user programs can invoke +system calls very fast because it is unnecessary to switch between a kernel +mode and a user mode by using costly software interruptions or context switches. +Unlike kernel modules, user programs are executed +as ordinary processes (except for their privilege level), +so scheduling and paging are performed as usual. 
+ +Although it seems dangerous to let user programs access a kernel directly, +the safety of the kernel can be ensured, for example, by static type checking, +software fault isolation, and so forth. +As a proof of concept, we are developing a system which is based on the combination +of Kernel Mode Linux and Typed Assembly Language, TAL. +(TAL can ensure the safety of programs through its type checking, and +the type checking can be done at the machine binary level. +For more information about TAL, see http://www.cs.cornell.edu/talc.) + + +Note: + +Currently, only IA-32 is supported. +Programs executed in a kernel mode shouldn't modify their CS, DS, FS and SS registers. +If they are modified, the system will be in an undefined state. + + +Instructions: + +To enable Kernel Mode Linux, say Y in the Kernel Mode Linux field of +the kernel configuration, build and install the kernel, and reboot your machine. +Then, all executables under the directory /trusted are executed in a kernel mode +in the current Kernel Mode Linux implementation. For example, to execute a program +named "cat" in a kernel mode, copy the program to the directory /trusted +and execute it as follows: + +% /trusted/cat + + +Implementation for IA-32: + +To execute user programs in a kernel mode, Kernel Mode Linux has +a special start_thread (start_kernel_thread) routine, +which is called in execve(2) and sets the registers +of a user process to specified initial values. The original start_thread +routine sets the CS segment register to __USER_CS. The start_kernel_thread routine +sets the CS register to __KERNEL_CS. +Thus, a user program is started as a user process executed in a kernel mode. + +The biggest problem in implementing Kernel Mode Linux is +the stack starvation problem. Let's assume that a user program is executed +in a kernel mode and it causes a page fault on its user stack. 
+To generate a page fault exception, an IA-32 CPU tries to push several +registers (EIP, CS, and so on) to the same user stack because the program +is executed in a kernel mode and the IA-32 CPU doesn't switch its stack +to a kernel stack. Therefore, the IA-32 CPU cannot push the registers, +so it generates a double fault exception and fails again. +Finally, the IA-32 CPU gives up and resets itself. +This is the stack starvation problem. + +To solve the stack starvation problem, we use the IA-32 hardware task mechanism to +handle exceptions. By using an IA-32 task, the IA-32 CPU doesn't push the registers +to its stack but switches the execution context to a special context. +Therefore, the stack starvation problem doesn't occur. +However, it is costly to handle all exceptions by IA-32 tasks. +So, in the current Kernel Mode Linux implementation, +only the double fault exception is handled by an IA-32 task. + +The other problem is the manual stack switching problem. +In the normal Linux kernel, the IA-32 CPU switches the stack from a user stack +to a kernel stack at exceptions or interruptions. +However, in Kernel Mode Linux, a user program may be executed in a kernel mode +and the IA-32 CPU may not switch the stack. Therefore, +in the current Kernel Mode Linux implementation, the kernel switches the stack +manually at exceptions and interruptions. To switch the stack, +the kernel must know the location of the kernel stack in the address space. +However, at exceptions and interruptions, the kernel cannot use +general registers (EAX, EBX, and so on). Therefore, it is very difficult +to get the location of the kernel stack. + +To solve the above problem, the current Kernel Mode Linux implementation +exploits the per-CPU GDT. In Kernel Mode Linux, one segment descriptor of +the per-CPU GDT entries directly points to the location of the pointer to +the kernel stack in a TSS. Thus, by using the segment descriptor, the address +of the kernel stack can be obtained with only one general register. 
diff -urN linux-2.6.3.orig/MAINTAINERS linux-2.6.3/MAINTAINERS --- linux-2.6.3.orig/MAINTAINERS Thu Feb 19 01:26:06 2004 +++ linux-2.6.3/MAINTAINERS Sun Feb 22 00:34:22 2004 @@ -1186,6 +1186,12 @@ W: http://developer.osdl.org/rddunlap/kj-patches/ S: Maintained +KERNEL MODE LINUX +P: Toshiyuki Maeda +M: tosh@is.s.u-tokyo.ac.jp +W: http://www.yl.is.s.u-tokyo.ac.jp/~tosh/kml/ +S: Maintained + KERNEL NFSD P: Neil Brown M: neilb@cse.unsw.edu.au diff -urN linux-2.6.3.orig/Makefile linux-2.6.3/Makefile --- linux-2.6.3.orig/Makefile Thu Feb 19 01:26:06 2004 +++ linux-2.6.3/Makefile Sun Feb 22 00:34:22 2004 @@ -1,7 +1,7 @@ VERSION = 2 PATCHLEVEL = 6 SUBLEVEL = 3 -EXTRAVERSION = +EXTRAVERSION = -kml NAME=Feisty Dunnart # *DOCUMENTATION* diff -urN linux-2.6.3.orig/arch/i386/Kconfig linux-2.6.3/arch/i386/Kconfig --- linux-2.6.3.orig/arch/i386/Kconfig Thu Feb 19 01:25:35 2004 +++ linux-2.6.3/arch/i386/Kconfig Sun Feb 22 00:34:22 2004 @@ -1168,6 +1168,29 @@ source "arch/i386/oprofile/Kconfig" +if X86_WP_WORKS_OK + +menu "Kernel Mode Linux" + +config KERNEL_MODE_LINUX + bool "Kernel Mode Linux" + ---help--- + This enables Kernel Mode Linux. In Kernel Mode Linux, user programs + can be executed safely in a kernel mode and access a kernel address space + directly. Thus, for example, costly mode switching between a user and a kernel + can be eliminated. If you say Y here, the kernel enables Kernel Mode Linux. + + More information about Kernel Mode Linux can be found in the + <file:Documentation/kml.txt> + + If you don't know what to do here, say N. + +comment "Safety check have not been implemented" +depends on KERNEL_MODE_LINUX + +endmenu + +endif menu "Kernel hacking" diff -urN linux-2.6.3.orig/arch/i386/kernel/Makefile linux-2.6.3/arch/i386/kernel/Makefile --- linux-2.6.3.orig/arch/i386/kernel/Makefile Thu Feb 19 01:25:35 2004 +++ linux-2.6.3/arch/i386/kernel/Makefile Sun Feb 22 00:34:22 2004 @@ -39,8 +39,17 @@ # vsyscall.o contains the vsyscall DSO images as __initdata. 
# We must build both images before we can assemble it. # Note: kbuild does not track this dependency due to usage of .incbin -$(obj)/vsyscall.o: $(obj)/vsyscall-int80.so $(obj)/vsyscall-sysenter.so -targets += $(foreach F,int80 sysenter,vsyscall-$F.o vsyscall-$F.so) + +target_symbols = int80 sysenter kml +extra-y += vsyscall-int80.lds.s vsyscall-sysenter.lds.s vsyscall-kml.lds.s +lds-flags = -P -C -Ui386 +AFLAGS_vsyscall-int80.lds.o = $(lds-flags) +AFLAGS_vsyscall-sysenter.lds.o = $(lds-flags) +AFLAGS_vsyscall-kml.lds.o = $(lds-flags) + +shared_objs = $(foreach F,$(target_symbols),$(obj)/vsyscall-$F.so) +$(obj)/vsyscall.o: $(shared_objs) +targets += $(foreach F,$(target_symbols),vsyscall-$F.o vsyscall-$F.so) # The DSO images are built using a special linker script. quiet_cmd_syscall = SYSCALL $@ @@ -50,18 +59,22 @@ vsyscall-flags = -shared -s -Wl,-soname=linux-gate.so.1 SYSCFLAGS_vsyscall-sysenter.so = $(vsyscall-flags) SYSCFLAGS_vsyscall-int80.so = $(vsyscall-flags) +SYSCFLAGS_vsyscall-kml.so = $(vsyscall-flags) -$(obj)/vsyscall-int80.so $(obj)/vsyscall-sysenter.so: \ -$(obj)/vsyscall-%.so: $(src)/vsyscall.lds $(obj)/vsyscall-%.o FORCE +$(shared_objs): \ +$(obj)/vsyscall-%.so: $(src)/vsyscall-%.lds.s $(obj)/vsyscall-%.o FORCE $(call if_changed,syscall) # We also create a special relocatable object that should mirror the symbol # table and layout of the linked DSO. With ld -R we can then refer to # these symbols in the kernel code rather than hand-coded addresses. 
-extra-y += vsyscall-syms.o -$(obj)/built-in.o: $(obj)/vsyscall-syms.o -$(obj)/built-in.o: ld_flags += -R $(obj)/vsyscall-syms.o +extra-y += vsyscall-syms-sysenter.o vsyscall-syms-kml.o +$(obj)/built-in.o: $(obj)/vsyscall-syms-sysenter.o $(obj)/vsyscall-syms-kml.o +$(obj)/built-in.o: ld_flags += -R $(obj)/vsyscall-syms-sysenter.o -R $(obj)/vsyscall-syms-kml.o -SYSCFLAGS_vsyscall-syms.o = -r -$(obj)/vsyscall-syms.o: $(src)/vsyscall.lds $(obj)/vsyscall-sysenter.o FORCE +SYSCFLAGS_vsyscall-syms-sysenter.o = -r +$(obj)/vsyscall-syms-sysenter.o: $(src)/vsyscall-sysenter.lds.s $(obj)/vsyscall-sysenter.o FORCE + $(call if_changed,syscall) +SYSCFLAGS_vsyscall-syms-kml.o = -r +$(obj)/vsyscall-syms-kml.o: $(src)/vsyscall-kml.lds.s $(obj)/vsyscall-kml.o FORCE $(call if_changed,syscall) diff -urN linux-2.6.3.orig/arch/i386/kernel/cpu/common.c linux-2.6.3/arch/i386/kernel/cpu/common.c --- linux-2.6.3.orig/arch/i386/kernel/cpu/common.c Thu Feb 19 01:25:35 2004 +++ linux-2.6.3/arch/i386/kernel/cpu/common.c Sun Feb 22 00:34:22 2004 @@ -463,6 +463,9 @@ int cpu = smp_processor_id(); struct tss_struct * t = init_tss + cpu; struct thread_struct *thread = &current->thread; +#ifdef CONFIG_KERNEL_MODE_LINUX + struct tss_struct* doublefault_tss = doublefault_tsses + cpu; +#endif if (test_and_set_bit(cpu, &cpu_initialized)) { printk(KERN_WARNING "CPU#%d already initialized!\n", cpu); @@ -517,7 +520,13 @@ load_LDT(&init_mm.context); /* Set up doublefault TSS pointer in the GDT */ +#ifndef CONFIG_KERNEL_MODE_LINUX __set_tss_desc(cpu, GDT_ENTRY_DOUBLEFAULT_TSS, &doublefault_tss); +#else + set_ksl_desc(cpu, &t->esp0); + init_doublefault(cpu); + __set_tss_desc(cpu, GDT_ENTRY_DOUBLEFAULT_TSS, doublefault_tss); +#endif cpu_gdt_table[cpu][GDT_ENTRY_DOUBLEFAULT_TSS].b &= 0xfffffdff; /* Clear %fs and %gs. 
*/ diff -urN linux-2.6.3.orig/arch/i386/kernel/direct_call.h linux-2.6.3/arch/i386/kernel/direct_call.h --- linux-2.6.3.orig/arch/i386/kernel/direct_call.h Thu Jan 1 09:00:00 1970 +++ linux-2.6.3/arch/i386/kernel/direct_call.h Sun Feb 22 00:34:22 2004 @@ -0,0 +1,124 @@ +/* + * linux/arch/i386/kernel/direct_call.h + * + * Copyright (C) 2003 Toshiyuki Maeda + */ + +/* + * These are macros for making direct_call_table. + * + * This file should be included only from the "sys_call_table_maker.h" file. + */ + +#ifdef CONFIG_KERNEL_MODE_LINUX + +.macro direct_prepare_stack argnum +.if \argnum +addl $-(4 * \argnum), %esp +.else +addl $-4, %esp +.endif +.endm + +.macro direct_push_args argnum +.if \argnum +direct_push_args "(\argnum - 1)" +movl (12 + (\argnum - 1) * 4)(%ebp), %eax +movl %eax, ((\argnum - 1) * 4)(%esp) +.endif +.endm + +/* + * entry.S is compiled with the "-traditional" option. + * So, we perform an old-style concatenation instead of '##'! + */ +#define MAKE_DIRECTCALL(name, argnum, syscall_num) \ +.text; \ +ENTRY(direct_/**/name); \ + pushl %ebp; \ + movl %esp, %ebp; \ + movl %fs:0x0, %esp; \ +\ + direct_prepare_stack argnum; \ + direct_push_args argnum; \ +\ + call name; \ +\ + GET_THREAD_INFO(%edx); \ + leave; \ +\ + movl TI_FLAGS(%edx), %ecx; \ + testw $_TIF_ALLWORK_MASK, %cx; \ + jne 0f; \ + ret; \ +0:; \ + pushl %eax; \ + pushl %ebx; \ + pushl %edi; \ + pushl %esi; \ + pushl %ebp; \ + movl $(syscall_num), %eax; \ + jmp direct_exit_work_/**/argnum; + +#define MAKE_DIRECTCALL_SPECIAL(name, argnum, syscall_num) \ +.text; \ +ENTRY(direct_/**/name); \ + pushl %ebx; \ + pushl %edi; \ + pushl %esi; \ + pushl %ebp; \ + add $-4, %esp; \ +\ + movl $(syscall_num), %eax; \ +\ + call direct_special_work_/**/argnum; \ +\ + pushfl; \ + pushl %cs; \ + pushl $direct_wrapper_int_post; \ + jmp system_call; + +direct_wrapper_int_pre: + int $0x80 +direct_wrapper_int_post: + addl $4, %esp + popl %ebp + popl %esi + popl %edi + popl %ebx + ret + +direct_exit_work_6: + movl 
48(%esp), %ebp +direct_exit_work_5: + movl 44(%esp), %edi +direct_exit_work_4: + movl 40(%esp), %esi +direct_exit_work_3: + movl 36(%esp), %edx +direct_exit_work_2: + movl 32(%esp), %ecx +direct_exit_work_1: + movl 28(%esp), %ebx +direct_exit_work_0: + pushfl + pushl %cs + pushl $direct_wrapper_int_post + jmp kml_exit_work + +direct_special_work_6: + movl 52(%esp), %ebp +direct_special_work_5: + movl 48(%esp), %edi +direct_special_work_4: + movl 44(%esp), %esi +direct_special_work_3: + movl 40(%esp), %edx +direct_special_work_2: + movl 36(%esp), %ecx +direct_special_work_1: + movl 32(%esp), %ebx +direct_special_work_0: + ret + +#endif diff -urN linux-2.6.3.orig/arch/i386/kernel/doublefault.c linux-2.6.3/arch/i386/kernel/doublefault.c --- linux-2.6.3.orig/arch/i386/kernel/doublefault.c Thu Feb 19 01:25:35 2004 +++ linux-2.6.3/arch/i386/kernel/doublefault.c Sun Feb 22 00:34:22 2004 @@ -8,13 +8,20 @@ #include #include +#ifndef CONFIG_KERNEL_MODE_LINUX + #define DOUBLEFAULT_STACKSIZE (1024) static unsigned long doublefault_stack[DOUBLEFAULT_STACKSIZE]; #define STACK_START (unsigned long)(doublefault_stack+DOUBLEFAULT_STACKSIZE) +#endif + #define ptr_ok(x) ((x) > 0xc0000000 && (x) < 0xc1000000) -static void doublefault_fn(void) +#ifndef CONFIG_KERNEL_MODE_LINUX +static +#endif +void doublefault_fn(void) { struct Xgt_desc_struct gdt_desc = {0, 0}; unsigned long gdt, tss; @@ -46,6 +53,8 @@ for (;;) /* nothing */; } +#ifndef CONFIG_KERNEL_MODE_LINUX + struct tss_struct doublefault_tss __cacheline_aligned = { .esp0 = STACK_START, .ss0 = __KERNEL_DS, @@ -62,3 +71,39 @@ .__cr3 = __pa(swapper_pg_dir) }; + +#else + +extern void double_fault_task(void); + +#define INIT_DFT { \ + .ss0 = __KERNEL_DS, \ + .ldt = 0, \ + .io_bitmap_base = INVALID_IO_BITMAP_OFFSET, \ + .eip = (unsigned long) double_fault_task, \ + .eflags = 0x00000082, \ + .es = __USER_DS, \ + .cs = __KERNEL_CS, \ + .ss = __KERNEL_DS, \ + .ds = __USER_DS \ +} + +struct tss_struct doublefault_tsses[NR_CPUS] 
__cacheline_aligned = { [0 ... NR_CPUS-1] = INIT_DFT }; +struct dft_stack_struct dft_stacks[NR_CPUS] __cacheline_aligned; + +void init_doublefault(int cpu) +{ + struct tss_struct* tss = init_tss + cpu; + struct tss_struct* doublefault_tss = doublefault_tsses + cpu; + struct dft_stack_struct* dft_stack = dft_stacks + cpu; + + doublefault_tss->esp = (unsigned long)(&(dft_stack->error_code) + 1); + doublefault_tss->esp0 = doublefault_tss->esp; + dft_stack->current_tss = doublefault_tss; + dft_stack->previous_tss = tss; + + tss->ldt = GDT_ENTRY_LDT << 3; + doublefault_tss->ldt = GDT_ENTRY_LDT << 3; +} + +#endif diff -urN linux-2.6.3.orig/arch/i386/kernel/entry.S linux-2.6.3/arch/i386/kernel/entry.S --- linux-2.6.3.orig/arch/i386/kernel/entry.S Thu Feb 19 01:25:35 2004 +++ linux-2.6.3/arch/i386/kernel/entry.S Sun Feb 22 00:36:33 2004 @@ -63,6 +63,14 @@ ORIG_EAX = 0x24 EIP = 0x28 CS = 0x2C +#ifdef CONFIG_KERNEL_MODE_LINUX +/* + * CS_HW is used as stack switch indicator. + * If CS_HW is non-zero, stack switch occured. + * That is, we were in Kernel-User mode before interruption. + */ +CS_HW = 0x2E +#endif EFLAGS = 0x30 OLDESP = 0x34 OLDSS = 0x38 @@ -128,6 +136,7 @@ .previous +#ifndef CONFIG_KERNEL_MODE_LINUX #define RESTORE_ALL \ RESTORE_REGS \ addl $4, %esp; \ @@ -144,10 +153,204 @@ .align 4; \ .long 1b,2b; \ .previous +#else +#define RESTORE_ALL \ + RESTORE_REGS \ + addl $4, %esp; \ +/* Switch stack KK -> KU. 
*/ \ + /* check whether if stack switch occured or not */ \ + cmpw $0x0, 6(%esp); \ + jne iret_to_ku; \ +restore_all_iret: \ + iret; \ +.section .fixup,"ax"; \ +restore_all_exit: \ + sti; \ + movl $(__USER_DS), %edx; \ + movl %edx, %ds; \ + movl %edx, %es; \ + pushl $11; \ + call do_exit; \ +.previous; \ +.section __ex_table,"a";\ + .align 4; \ + .long restore_all_iret, restore_all_exit; \ +.previous \ +ENTRY(iret_to_ku) \ + /* restore kernel-mode user process XCS == __KERNEL_CS */ \ + movl $__KERNEL_CS, 4(%esp); \ + pushl %ebp; \ + /* check whether if we can skip iret */ \ + movl 12(%esp), %ebp; \ + testl $~(0x240fd7), %ebp; \ + movl 16(%esp), %ebp; \ + jz skip_iret; \ + addl $-16, %ebp; \ +iret_to_ku_mov_ebp: \ + popl (%ebp); \ +iret_to_ku_mov_eip: \ + popl 4(%ebp); \ +iret_to_ku_mov_cs: \ + popl 8(%ebp); \ +iret_to_ku_mov_eflags: \ + popl 12(%ebp); \ + movl %ebp, %esp; \ +iret_to_ku_pop_ebp: \ + popl %ebp; \ +iret_to_ku_iret: \ + iret; \ +.section __ex_table,"a"; \ + .align 4; \ + .long iret_to_ku_mov_ebp, restore_all_exit; \ + .long iret_to_ku_mov_eip, restore_all_exit; \ + .long iret_to_ku_mov_cs, restore_all_exit; \ + .long iret_to_ku_mov_eflags, restore_all_exit; \ + .long iret_to_ku_pop_ebp, restore_all_exit; \ + .long iret_to_ku_iret, restore_all_exit; \ +.previous \ +ENTRY(skip_iret) \ + addl $-12, %ebp; \ +skip_iret_mov_ebp: \ + popl (%ebp); \ +skip_iret_mov_eip: \ + popl 8(%ebp); \ + addl $4, %esp; \ +skip_iret_mov_eflags: \ + popl 4(%ebp); \ + movl %ebp, %esp; \ +skip_iret_pop_ebp: \ + popl %ebp; \ +skip_iret_pop_eflags: \ + popfl; \ +skip_iret_ret: \ + ret; \ +.section __ex_table,"a";\ + .align 4; \ + .long skip_iret_mov_ebp, restore_all_exit; \ + .long skip_iret_mov_eip, restore_all_exit; \ + .long skip_iret_mov_eflags, restore_all_exit; \ + .long skip_iret_pop_ebp, restore_all_exit; \ + .long skip_iret_pop_eflags, restore_all_exit; \ + .long skip_iret_ret, restore_all_exit; \ +.previous +#endif + +#ifndef CONFIG_KERNEL_MODE_LINUX +#define 
SWITCH_STACK_TO_KK +#define SWITCH_STACK_TO_KK_WITH_ERROR_CODE +#define SWITCH_STACK_TO_KK_LCALL +#else + +#define TASK_SIZE (__PAGE_OFFSET) +#define __SW_KM_USER_CS (0xffff0000 | __USER_CS) + +/* + * This is a macro for stack switching. + */ +#define SWITCH_STACK_TO_KK \ + /* Check whether if we were in Kernel-User mode or not. */ \ + cmpl $(TASK_SIZE), %esp; \ + /* For ancient processors, clear stack switch in XCS */ \ + /* because they don't clear High 16 bits of XCS. */ \ + movw $0x0, 6(%esp); \ + ja 2f; \ + /* \ + * We were in Kernel-User mode. \ + * Therefore, XCS == __KERNEL_CS. \ + * Thus, we can safely overwrite XCS. \ + */ \ + movl %ebp, 4(%esp); /* save %ebp to XCS */ \ + movl %ds, %ebp; \ + cmpw $(__KERNELSTACK_DS), %bp; \ + movl $(__KERNELSTACK_DS), %ebp; \ + movl %ebp, %ds; \ + movl %esp, %ebp; \ + movl (0x0), %esp; \ + je 1f; \ + pushl $(__USER_DS); \ + popl %ds; \ +1: \ + addl $12, %ebp; \ + addl $-4, %esp; /* XSS */ \ + pushl %ebp; /* ESP */ \ + pushl -4(%ebp); /* EFLAGS */ \ + pushl $(__SW_KM_USER_CS); /* XCS */ \ + pushl -12(%ebp); /* EIP */ \ + movl -8(%ebp), %ebp; /* restore %ebp from XCS */ \ +2: +/* + * This is as same as the SWITCH_STACK_TO_KK + * but handles an error code on a stack + */ +#define SWITCH_STACK_TO_KK_WITH_ERROR_CODE \ + cmpl $(TASK_SIZE), %esp; \ + movw $0x0, 10(%esp); /* clear stack switch in XCS, sigh... */ \ + ja 2f; \ + /* \ + * We are in Kernel-User mode. \ + * Therefore, XCS == __KERNEL_CS. 
\ + */ \ + movl %ebp, 8(%esp); /* save %ebp to XCS */ \ + movl %ds, %ebp; \ + cmpw $(__KERNELSTACK_DS), %bp; \ + movl $(__KERNELSTACK_DS), %ebp; \ + movl %ebp, %ds; \ + movl %esp, %ebp; \ + movl (0x0), %esp; \ + je 1f; \ + pushl $(__USER_DS); \ + popl %ds; \ +1: \ + addl $16, %ebp; \ + addl $-4, %esp; /* XSS */ \ + pushl %ebp; /* ESP */ \ + pushl -4(%ebp); /* EFLAGS */ \ + pushl $(__SW_KM_USER_CS); /* XCS */ \ + pushl -12(%ebp); /* EIP */ \ + pushl -16(%ebp); /* error_code */ \ + movl -8(%ebp), %ebp; /* restore %ebp from XCS */ \ +2: + +/* + * This is as same as the SWITCH_STACK_TO_KK + * but handles lcall + */ +#define SWITCH_STACK_TO_KK_LCALL \ + /* Check whether if we were in Kernel-User mode or not. */ \ + cmpl $(TASK_SIZE), %esp; \ + /* For ancient processors, clear stack switch in XCS */ \ + /* because they don't clear High 16 bits of XCS. */ \ + movw $0x0, 6(%esp); \ + ja 2f; \ + /* \ + * We were in Kernel-User mode. \ + * Therefore, XCS == __KERNEL_CS. \ + * Thus, we can safely overwrite XCS. \ + */ \ + movl %ebp, 4(%esp); /* save %ebp to XCS */ \ + movl %ds, %ebp; \ + cmpw $(__KERNELSTACK_DS), %bp; \ + movl $(__KERNELSTACK_DS), %ebp; \ + movl %ebp, %ds; \ + movl %esp, %ebp; \ + movl (0x0), %esp; \ + je 1f; \ + pushl $(__USER_DS); \ + popl %ds; \ +1: \ + addl $8, %ebp; \ + addl $-4, %esp; /* XSS */ \ + pushl %ebp; /* ESP */ \ + pushl $(__SW_KM_USER_CS); /* XCS */ \ + pushl -8(%ebp); /* EIP */ \ + movl -4(%ebp), %ebp; /* restore %ebp from XCS */ \ +2: +#endif ENTRY(lcall7) + SWITCH_STACK_TO_KK_LCALL pushfl # We get a different stack layout with call # gates, which has to be cleaned up later.. pushl %eax @@ -170,6 +373,7 @@ jmp resume_userspace ENTRY(lcall27) + SWITCH_STACK_TO_KK_LCALL pushfl # We get a different stack layout with call # gates, which has to be cleaned up later.. 
pushl %eax @@ -278,9 +482,9 @@ sti sysexit - # system call handler stub ENTRY(system_call) + SWITCH_STACK_TO_KK pushl %eax # save orig_eax SAVE_ALL GET_THREAD_INFO(%ebp) @@ -388,10 +592,12 @@ ENTRY(irq_entries_start) .rept NR_IRQS ALIGN -1: pushl $vector-256 +0: /* local label "1", "2" and "3" are used in SWITCH_STACK_TO_KK ! */ + SWITCH_STACK_TO_KK + pushl $vector-256 jmp common_interrupt .data - .long 1b + .long 0b .text vector=vector+1 .endr @@ -404,15 +610,17 @@ #define BUILD_INTERRUPT(name, nr) \ ENTRY(name) \ + SWITCH_STACK_TO_KK \ pushl $nr-256; \ SAVE_ALL \ - call smp_/**/name; \ + call smp_/**/name; \ jmp ret_from_intr; /* The include is where all of the SMP etc. interrupts come from */ #include "entry_arch.h" ENTRY(divide_error) + SWITCH_STACK_TO_KK pushl $0 # no error code pushl $do_divide_error ALIGN @@ -444,16 +652,19 @@ jmp ret_from_exception ENTRY(coprocessor_error) + SWITCH_STACK_TO_KK pushl $0 pushl $do_coprocessor_error jmp error_code ENTRY(simd_coprocessor_error) + SWITCH_STACK_TO_KK pushl $0 pushl $do_simd_coprocessor_error jmp error_code ENTRY(device_not_available) + SWITCH_STACK_TO_KK pushl $-1 # mark this as an int SAVE_ALL movl %cr0, %eax @@ -491,6 +702,7 @@ pushl $sysenter_past_esp ENTRY(debug) + SWITCH_STACK_TO_KK cmpl $sysenter_entry,(%esp) jne debug_stack_correct FIX_STACK(12, debug_stack_correct, debug_esp_fix_insn) @@ -508,6 +720,7 @@ * fault happened on the sysenter path. 
*/ ENTRY(nmi) + SWITCH_STACK_TO_KK cmpl $sysenter_entry,(%esp) je nmi_stack_fixup pushl %eax @@ -529,7 +742,7 @@ pushl %edx call do_nmi addl $8, %esp - RESTORE_ALL + jmp restore_all nmi_stack_fixup: FIX_STACK(12,nmi_stack_correct, 1) @@ -546,341 +759,476 @@ jmp nmi_stack_correct ENTRY(int3) + SWITCH_STACK_TO_KK pushl $0 pushl $do_int3 jmp error_code ENTRY(overflow) + SWITCH_STACK_TO_KK pushl $0 pushl $do_overflow jmp error_code ENTRY(bounds) + SWITCH_STACK_TO_KK pushl $0 pushl $do_bounds jmp error_code ENTRY(invalid_op) + SWITCH_STACK_TO_KK pushl $0 pushl $do_invalid_op jmp error_code ENTRY(coprocessor_segment_overrun) + SWITCH_STACK_TO_KK pushl $0 pushl $do_coprocessor_segment_overrun jmp error_code +#ifdef CONFIG_KERNEL_MODE_LINUX + +PAGE_FAULT_ERROR_CODE = 0x2 +TSS_CR3 = 28 +TSS_EIP = 32 +TSS_EFLAGS = 36 +TSS_CS = 76 +TSS_ESP = 56 +TSS_SS = 80 + +/* + * This is a task-handler for double fault. + * In Kernel Mode Linux, user programs may be executed in ring 0 (kernel mode). + * Therefore, normal interruption handling mechanism doesn't work. + * For example, if a page fault occurs in a stack, + * CPU cannot generate a page fault exception because there is no stack + * to save the CPU context. We call this problem "stack starvation". + * To solve the stack starvation, we handle double fault with task-handler. + */ +ENTRY(double_fault_task) + movl 4(%esp), %edi # get current TSS. +/* %edi = current_tss */ + movl 8(%esp), %ebx # get previous TSS. +/* %ebx = prev_tss */ + + # get kernel stack. + cmpw $__KERNEL_CS, TSS_CS(%ebx) + jne 1f + movl TSS_ESP(%ebx), %esi + cmpl $TASK_SIZE, %esi + ja 2f +1: + movl $(__KERNELSTACK_DS), %eax + movl %eax, %ds + movl (0x0), %esi + movl $(__USER_DS), %eax + movl %eax, %ds +2: + movl %esi, %esp +/* From now on, we can use stack. */ + + # recreate stack layout as if normal interruption occurs. 
+ cmpw $__KERNEL_CS, TSS_CS(%ebx) + jne 3f + movl TSS_ESP(%ebx), %esi + cmpl $TASK_SIZE, %esi + ja 4f +3: + pushl TSS_SS(%ebx) + pushl TSS_ESP(%ebx) + + movl TSS_ESP(%ebx), %esi +4: + pushl TSS_EFLAGS(%ebx) + pushl TSS_CS(%ebx) + pushl TSS_EIP(%ebx) + + movw $0x0, 6(%esp) + cmpw $__KERNEL_CS, TSS_CS(%ebx) + jne 5f + cmpl $TASK_SIZE, %esi + ja 5f + /* record stack switch in XCS */ + movl $__SW_KM_USER_CS, 4(%esp) +5: + + # check whether if stack starvation occured or not. +/* %esi = prev_tss->esp */ + # calling address_presents_and_writable + addl $-4, %esi /* %esi = prev_tss->esp - 4 */ + addl $-12, %esp + pushl %esi + call address_presents_and_writable + addl $16, %esp + + testl %eax, %eax + jne 7f +6: + pushl $PAGE_FAULT_ERROR_CODE + movl $page_fault_no_stack_switch, TSS_EIP(%ebx) + andb $253, 37(%ebx) /* == andl $~IF_MASK, TSS_EFLAGS(%ebx) */ + movl %esi, %eax + movl %eax, %cr2 + jmp 9f +7: + addl $-12, %esi /* %esi = prev_tss->esp - 16 */ + addl $-12, %esp + pushl %esi + call address_presents_and_writable + addl $16, %esp + + testl %eax, %eax + jne 8f + jmp 6b +8: +/* + pushl $0 + movl $double_fault_no_stack_switch, TSS_EIP(%ebx) +*/ + jmp doublefault_fn +9: + andb $254, 37(%ebx) /* == andl $~TF_MASK, TSS_EFLAGS(%ebx) */ + movw $__KERNEL_CS, TSS_CS(%ebx) + movl %esp, TSS_ESP(%ebx) + movw $__KERNEL_DS, TSS_SS(%ebx) + + movl TSS_CR3(%edi), %eax + movl %eax, TSS_CR3(%ebx) + + movl TSS_ESP(%edi), %esp + + iret + jmp double_fault_task +#endif + ENTRY(invalid_TSS) + SWITCH_STACK_TO_KK_WITH_ERROR_CODE pushl $do_invalid_TSS jmp error_code ENTRY(segment_not_present) + SWITCH_STACK_TO_KK_WITH_ERROR_CODE pushl $do_segment_not_present jmp error_code ENTRY(stack_segment) + SWITCH_STACK_TO_KK_WITH_ERROR_CODE pushl $do_stack_segment jmp error_code ENTRY(general_protection) + SWITCH_STACK_TO_KK_WITH_ERROR_CODE pushl $do_general_protection jmp error_code ENTRY(alignment_check) + SWITCH_STACK_TO_KK_WITH_ERROR_CODE pushl $do_alignment_check jmp error_code ENTRY(page_fault) + 
SWITCH_STACK_TO_KK_WITH_ERROR_CODE pushl $do_page_fault jmp error_code +#ifdef CONFIG_KERNEL_MODE_LINUX +ENTRY(page_fault_no_stack_switch) + pushl $do_page_fault + jmp error_code +#endif + #ifdef CONFIG_X86_MCE ENTRY(machine_check) + SWITCH_STACK_TO_KK pushl $0 pushl machine_check_vector jmp error_code #endif ENTRY(spurious_interrupt_bug) + SWITCH_STACK_TO_KK pushl $0 pushl $do_spurious_interrupt_bug jmp error_code -.data -ENTRY(sys_call_table) - .long sys_restart_syscall /* 0 - old "setup()" system call, used for restarting */ - .long sys_exit - .long sys_fork - .long sys_read - .long sys_write - .long sys_open /* 5 */ - .long sys_close - .long sys_waitpid - .long sys_creat - .long sys_link - .long sys_unlink /* 10 */ - .long sys_execve - .long sys_chdir - .long sys_time - .long sys_mknod - .long sys_chmod /* 15 */ - .long sys_lchown16 - .long sys_ni_syscall /* old break syscall holder */ - .long sys_stat - .long sys_lseek - .long sys_getpid /* 20 */ - .long sys_mount - .long sys_oldumount - .long sys_setuid16 - .long sys_getuid16 - .long sys_stime /* 25 */ - .long sys_ptrace - .long sys_alarm - .long sys_fstat - .long sys_pause - .long sys_utime /* 30 */ - .long sys_ni_syscall /* old stty syscall holder */ - .long sys_ni_syscall /* old gtty syscall holder */ - .long sys_access - .long sys_nice - .long sys_ni_syscall /* 35 - old ftime syscall holder */ - .long sys_sync - .long sys_kill - .long sys_rename - .long sys_mkdir - .long sys_rmdir /* 40 */ - .long sys_dup - .long sys_pipe - .long sys_times - .long sys_ni_syscall /* old prof syscall holder */ - .long sys_brk /* 45 */ - .long sys_setgid16 - .long sys_getgid16 - .long sys_signal - .long sys_geteuid16 - .long sys_getegid16 /* 50 */ - .long sys_acct - .long sys_umount /* recycled never used phys() */ - .long sys_ni_syscall /* old lock syscall holder */ - .long sys_ioctl - .long sys_fcntl /* 55 */ - .long sys_ni_syscall /* old mpx syscall holder */ - .long sys_setpgid - .long sys_ni_syscall /* old ulimit 
syscall holder */ - .long sys_olduname - .long sys_umask /* 60 */ - .long sys_chroot - .long sys_ustat - .long sys_dup2 - .long sys_getppid - .long sys_getpgrp /* 65 */ - .long sys_setsid - .long sys_sigaction - .long sys_sgetmask - .long sys_ssetmask - .long sys_setreuid16 /* 70 */ - .long sys_setregid16 - .long sys_sigsuspend - .long sys_sigpending - .long sys_sethostname - .long sys_setrlimit /* 75 */ - .long sys_old_getrlimit - .long sys_getrusage - .long sys_gettimeofday - .long sys_settimeofday - .long sys_getgroups16 /* 80 */ - .long sys_setgroups16 - .long old_select - .long sys_symlink - .long sys_lstat - .long sys_readlink /* 85 */ - .long sys_uselib - .long sys_swapon - .long sys_reboot - .long old_readdir - .long old_mmap /* 90 */ - .long sys_munmap - .long sys_truncate - .long sys_ftruncate - .long sys_fchmod - .long sys_fchown16 /* 95 */ - .long sys_getpriority - .long sys_setpriority - .long sys_ni_syscall /* old profil syscall holder */ - .long sys_statfs - .long sys_fstatfs /* 100 */ - .long sys_ioperm - .long sys_socketcall - .long sys_syslog - .long sys_setitimer - .long sys_getitimer /* 105 */ - .long sys_newstat - .long sys_newlstat - .long sys_newfstat - .long sys_uname - .long sys_iopl /* 110 */ - .long sys_vhangup - .long sys_ni_syscall /* old "idle" system call */ - .long sys_vm86old - .long sys_wait4 - .long sys_swapoff /* 115 */ - .long sys_sysinfo - .long sys_ipc - .long sys_fsync - .long sys_sigreturn - .long sys_clone /* 120 */ - .long sys_setdomainname - .long sys_newuname - .long sys_modify_ldt - .long sys_adjtimex - .long sys_mprotect /* 125 */ - .long sys_sigprocmask - .long sys_ni_syscall /* old "create_module" */ - .long sys_init_module - .long sys_delete_module - .long sys_ni_syscall /* 130: old "get_kernel_syms" */ - .long sys_quotactl - .long sys_getpgid - .long sys_fchdir - .long sys_bdflush - .long sys_sysfs /* 135 */ - .long sys_personality - .long sys_ni_syscall /* reserved for afs_syscall */ - .long sys_setfsuid16 - .long 
sys_setfsgid16 - .long sys_llseek /* 140 */ - .long sys_getdents - .long sys_select - .long sys_flock - .long sys_msync - .long sys_readv /* 145 */ - .long sys_writev - .long sys_getsid - .long sys_fdatasync - .long sys_sysctl - .long sys_mlock /* 150 */ - .long sys_munlock - .long sys_mlockall - .long sys_munlockall - .long sys_sched_setparam - .long sys_sched_getparam /* 155 */ - .long sys_sched_setscheduler - .long sys_sched_getscheduler - .long sys_sched_yield - .long sys_sched_get_priority_max - .long sys_sched_get_priority_min /* 160 */ - .long sys_sched_rr_get_interval - .long sys_nanosleep - .long sys_mremap - .long sys_setresuid16 - .long sys_getresuid16 /* 165 */ - .long sys_vm86 - .long sys_ni_syscall /* Old sys_query_module */ - .long sys_poll - .long sys_nfsservctl - .long sys_setresgid16 /* 170 */ - .long sys_getresgid16 - .long sys_prctl - .long sys_rt_sigreturn - .long sys_rt_sigaction - .long sys_rt_sigprocmask /* 175 */ - .long sys_rt_sigpending - .long sys_rt_sigtimedwait - .long sys_rt_sigqueueinfo - .long sys_rt_sigsuspend - .long sys_pread64 /* 180 */ - .long sys_pwrite64 - .long sys_chown16 - .long sys_getcwd - .long sys_capget - .long sys_capset /* 185 */ - .long sys_sigaltstack - .long sys_sendfile - .long sys_ni_syscall /* reserved for streams1 */ - .long sys_ni_syscall /* reserved for streams2 */ - .long sys_vfork /* 190 */ - .long sys_getrlimit - .long sys_mmap2 - .long sys_truncate64 - .long sys_ftruncate64 - .long sys_stat64 /* 195 */ - .long sys_lstat64 - .long sys_fstat64 - .long sys_lchown - .long sys_getuid - .long sys_getgid /* 200 */ - .long sys_geteuid - .long sys_getegid - .long sys_setreuid - .long sys_setregid - .long sys_getgroups /* 205 */ - .long sys_setgroups - .long sys_fchown - .long sys_setresuid - .long sys_getresuid - .long sys_setresgid /* 210 */ - .long sys_getresgid - .long sys_chown - .long sys_setuid - .long sys_setgid - .long sys_setfsuid /* 215 */ - .long sys_setfsgid - .long sys_pivot_root - .long sys_mincore 
- .long sys_madvise - .long sys_getdents64 /* 220 */ - .long sys_fcntl64 - .long sys_ni_syscall /* reserved for TUX */ - .long sys_ni_syscall - .long sys_gettid - .long sys_readahead /* 225 */ - .long sys_setxattr - .long sys_lsetxattr - .long sys_fsetxattr - .long sys_getxattr - .long sys_lgetxattr /* 230 */ - .long sys_fgetxattr - .long sys_listxattr - .long sys_llistxattr - .long sys_flistxattr - .long sys_removexattr /* 235 */ - .long sys_lremovexattr - .long sys_fremovexattr - .long sys_tkill - .long sys_sendfile64 - .long sys_futex /* 240 */ - .long sys_sched_setaffinity - .long sys_sched_getaffinity - .long sys_set_thread_area - .long sys_get_thread_area - .long sys_io_setup /* 245 */ - .long sys_io_destroy - .long sys_io_getevents - .long sys_io_submit - .long sys_io_cancel - .long sys_fadvise64 /* 250 */ - .long sys_ni_syscall - .long sys_exit_group - .long sys_lookup_dcookie - .long sys_epoll_create - .long sys_epoll_ctl /* 255 */ - .long sys_epoll_wait - .long sys_remap_file_pages - .long sys_set_tid_address - .long sys_timer_create - .long sys_timer_settime /* 260 */ - .long sys_timer_gettime - .long sys_timer_getoverrun - .long sys_timer_delete - .long sys_clock_settime - .long sys_clock_gettime /* 265 */ - .long sys_clock_getres - .long sys_clock_nanosleep - .long sys_statfs64 - .long sys_fstatfs64 - .long sys_tgkill /* 270 */ - .long sys_utimes - .long sys_fadvise64_64 - .long sys_ni_syscall /* sys_vserver */ +#include "sys_call_table_maker.h" +SYSCALL_TABLE_BEGIN + SYSCALL_ENTRY(sys_restart_syscall,0) /* 0 - old "setup()" system call, used for restarting */ + SYSCALL_ENTRY(sys_exit,1) + SYSCALL_ENTRY_SPECIAL(sys_fork,0) + SYSCALL_ENTRY(sys_read,3) + SYSCALL_ENTRY(sys_write,3) + SYSCALL_ENTRY(sys_open,3) /* 5 */ + SYSCALL_ENTRY(sys_close,1) + SYSCALL_ENTRY(sys_waitpid,3) + SYSCALL_ENTRY(sys_creat,2) + SYSCALL_ENTRY(sys_link,2) + SYSCALL_ENTRY(sys_unlink,1) /* 10 */ + SYSCALL_ENTRY_SPECIAL(sys_execve,3) + SYSCALL_ENTRY(sys_chdir,1) + 
SYSCALL_ENTRY(sys_time,1) + SYSCALL_ENTRY(sys_mknod,3) + SYSCALL_ENTRY(sys_chmod,2) /* 15 */ + SYSCALL_ENTRY(sys_lchown16,3) + SYSCALL_ENTRY(sys_ni_syscall,0) /* old break syscall holder */ + SYSCALL_ENTRY(sys_stat,2) + SYSCALL_ENTRY(sys_lseek,3) + SYSCALL_ENTRY(sys_getpid,0) /* 20 */ + SYSCALL_ENTRY(sys_mount,5) + SYSCALL_ENTRY(sys_oldumount,1) + SYSCALL_ENTRY(sys_setuid16,1) + SYSCALL_ENTRY(sys_getuid16,0) + SYSCALL_ENTRY(sys_stime,1) /* 25 */ + SYSCALL_ENTRY(sys_ptrace,4) + SYSCALL_ENTRY(sys_alarm,1) + SYSCALL_ENTRY(sys_fstat,2) + SYSCALL_ENTRY(sys_pause,0) + SYSCALL_ENTRY(sys_utime,2) /* 30 */ + SYSCALL_ENTRY(sys_ni_syscall,0) /* old stty syscall holder */ + SYSCALL_ENTRY(sys_ni_syscall,0) /* old gtty syscall holder */ + SYSCALL_ENTRY(sys_access,2) + SYSCALL_ENTRY(sys_nice,1) + SYSCALL_ENTRY(sys_ni_syscall,0) /* 35 - old ftime syscall holder */ + SYSCALL_ENTRY(sys_sync,0) + SYSCALL_ENTRY(sys_kill,2) + SYSCALL_ENTRY(sys_rename,2) + SYSCALL_ENTRY(sys_mkdir,2) + SYSCALL_ENTRY(sys_rmdir,1) /* 40 */ + SYSCALL_ENTRY(sys_dup,1) + SYSCALL_ENTRY(sys_pipe,1) + SYSCALL_ENTRY(sys_times,1) + SYSCALL_ENTRY(sys_ni_syscall,0) /* old prof syscall holder */ + SYSCALL_ENTRY(sys_brk,1) /* 45 */ + SYSCALL_ENTRY(sys_setgid16,1) + SYSCALL_ENTRY(sys_getgid16,0) + SYSCALL_ENTRY(sys_signal,2) + SYSCALL_ENTRY(sys_geteuid16,0) + SYSCALL_ENTRY(sys_getegid16,0) /* 50 */ + SYSCALL_ENTRY(sys_acct,1) + SYSCALL_ENTRY(sys_umount,2) /* recycled never used phys() */ + SYSCALL_ENTRY(sys_ni_syscall,0) /* old lock syscall holder */ + SYSCALL_ENTRY(sys_ioctl,3) + SYSCALL_ENTRY(sys_fcntl,3) /* 55 */ + SYSCALL_ENTRY(sys_ni_syscall,0) /* old mpx syscall holder */ + SYSCALL_ENTRY(sys_setpgid,2) + SYSCALL_ENTRY(sys_ni_syscall,0) /* old ulimit syscall holder */ + SYSCALL_ENTRY(sys_olduname,1) + SYSCALL_ENTRY(sys_umask,1) /* 60 */ + SYSCALL_ENTRY(sys_chroot,1) + SYSCALL_ENTRY(sys_ustat,2) + SYSCALL_ENTRY(sys_dup2,2) + SYSCALL_ENTRY(sys_getppid,0) + SYSCALL_ENTRY(sys_getpgrp,0) /* 65 */ + 
SYSCALL_ENTRY(sys_setsid,0) + SYSCALL_ENTRY(sys_sigaction,3) + SYSCALL_ENTRY(sys_sgetmask,0) + SYSCALL_ENTRY(sys_ssetmask,1) + SYSCALL_ENTRY(sys_setreuid16,2) /* 70 */ + SYSCALL_ENTRY(sys_setregid16,2) + SYSCALL_ENTRY_SPECIAL(sys_sigsuspend,3) + SYSCALL_ENTRY(sys_sigpending,1) + SYSCALL_ENTRY(sys_sethostname,2) + SYSCALL_ENTRY(sys_setrlimit,2) /* 75 */ + SYSCALL_ENTRY(sys_old_getrlimit,2) + SYSCALL_ENTRY(sys_getrusage,2) + SYSCALL_ENTRY(sys_gettimeofday,2) + SYSCALL_ENTRY(sys_settimeofday,2) + SYSCALL_ENTRY(sys_getgroups16,2) /* 80 */ + SYSCALL_ENTRY(sys_setgroups16,2) + SYSCALL_ENTRY(old_select,1) + SYSCALL_ENTRY(sys_symlink,2) + SYSCALL_ENTRY(sys_lstat,2) + SYSCALL_ENTRY(sys_readlink,3) /* 85 */ + SYSCALL_ENTRY(sys_uselib,1) + SYSCALL_ENTRY(sys_swapon,2) + SYSCALL_ENTRY(sys_reboot,4) + SYSCALL_ENTRY(old_readdir,3) + SYSCALL_ENTRY(old_mmap,1) /* 90 */ + SYSCALL_ENTRY(sys_munmap,2) + SYSCALL_ENTRY(sys_truncate,2) + SYSCALL_ENTRY(sys_ftruncate,2) + SYSCALL_ENTRY(sys_fchmod,2) + SYSCALL_ENTRY(sys_fchown16,3) /* 95 */ + SYSCALL_ENTRY(sys_getpriority,2) + SYSCALL_ENTRY(sys_setpriority,3) + SYSCALL_ENTRY(sys_ni_syscall,0) /* old profil syscall holder */ + SYSCALL_ENTRY(sys_statfs,2) + SYSCALL_ENTRY(sys_fstatfs,2) /* 100 */ + SYSCALL_ENTRY(sys_ioperm,3) + SYSCALL_ENTRY(sys_socketcall,2) + SYSCALL_ENTRY(sys_syslog,3) + SYSCALL_ENTRY(sys_setitimer,3) + SYSCALL_ENTRY(sys_getitimer,2) /* 105 */ + SYSCALL_ENTRY(sys_newstat,2) + SYSCALL_ENTRY(sys_newlstat,2) + SYSCALL_ENTRY(sys_newfstat,2) + SYSCALL_ENTRY(sys_uname,1) + SYSCALL_ENTRY_SPECIAL(sys_iopl,1) /* 110 */ + SYSCALL_ENTRY(sys_vhangup,0) + SYSCALL_ENTRY(sys_ni_syscall,0) /* old "idle" system call */ + SYSCALL_ENTRY_SPECIAL(sys_vm86old,1) /* takes struct pt_regs by value (like sys_fork/sys_clone), so it needs the full register frame; XXX: KML compatibility not tested */ + SYSCALL_ENTRY(sys_wait4,4) + SYSCALL_ENTRY(sys_swapoff,1) /* 115 */ + SYSCALL_ENTRY(sys_sysinfo,1) + SYSCALL_ENTRY(sys_ipc,6) + SYSCALL_ENTRY(sys_fsync,1) + SYSCALL_ENTRY_SPECIAL(sys_sigreturn,0) + SYSCALL_ENTRY_SPECIAL(sys_clone,3) /* 120 */ +
SYSCALL_ENTRY(sys_setdomainname,2) + SYSCALL_ENTRY(sys_newuname,1) + SYSCALL_ENTRY(sys_modify_ldt,3) + SYSCALL_ENTRY(sys_adjtimex,1) + SYSCALL_ENTRY(sys_mprotect,3) /* 125 */ + SYSCALL_ENTRY(sys_sigprocmask,3) + SYSCALL_ENTRY(sys_ni_syscall,0) /* old "create_module" */ + SYSCALL_ENTRY(sys_init_module,3) + SYSCALL_ENTRY(sys_delete_module,2) + SYSCALL_ENTRY(sys_ni_syscall,0) /* 130: old "get_kernel_syms" */ + SYSCALL_ENTRY(sys_quotactl,4) + SYSCALL_ENTRY(sys_getpgid,1) + SYSCALL_ENTRY(sys_fchdir,1) + SYSCALL_ENTRY(sys_bdflush,2) + SYSCALL_ENTRY(sys_sysfs,3) /* 135 */ + SYSCALL_ENTRY(sys_personality,1) + SYSCALL_ENTRY(sys_ni_syscall,0) /* reserved for afs_syscall */ + SYSCALL_ENTRY(sys_setfsuid16,1) + SYSCALL_ENTRY(sys_setfsgid16,1) + SYSCALL_ENTRY(sys_llseek,5) /* 140 */ + SYSCALL_ENTRY(sys_getdents,3) + SYSCALL_ENTRY(sys_select,5) + SYSCALL_ENTRY(sys_flock,2) + SYSCALL_ENTRY(sys_msync,3) + SYSCALL_ENTRY(sys_readv,3) /* 145 */ + SYSCALL_ENTRY(sys_writev,3) + SYSCALL_ENTRY(sys_getsid,1) + SYSCALL_ENTRY(sys_fdatasync,1) + SYSCALL_ENTRY(sys_sysctl,1) + SYSCALL_ENTRY(sys_mlock,2) /* 150 */ + SYSCALL_ENTRY(sys_munlock,2) + SYSCALL_ENTRY(sys_mlockall,1) + SYSCALL_ENTRY(sys_munlockall,0) + SYSCALL_ENTRY(sys_sched_setparam,2) + SYSCALL_ENTRY(sys_sched_getparam,2) /* 155 */ + SYSCALL_ENTRY(sys_sched_setscheduler,3) + SYSCALL_ENTRY(sys_sched_getscheduler,1) + SYSCALL_ENTRY(sys_sched_yield,0) + SYSCALL_ENTRY(sys_sched_get_priority_max,1) + SYSCALL_ENTRY(sys_sched_get_priority_min,1) /* 160 */ + SYSCALL_ENTRY(sys_sched_rr_get_interval,2) + SYSCALL_ENTRY(sys_nanosleep,2) + SYSCALL_ENTRY(sys_mremap,5) + SYSCALL_ENTRY(sys_setresuid16,3) + SYSCALL_ENTRY(sys_getresuid16,3) /* 165 */ + SYSCALL_ENTRY_SPECIAL(sys_vm86,2) /* takes struct pt_regs by value (like sys_fork/sys_clone), so it needs the full register frame; XXX: KML compatibility not tested */ + SYSCALL_ENTRY(sys_ni_syscall,0) /* Old sys_query_module */ + SYSCALL_ENTRY(sys_poll,3) + SYSCALL_ENTRY(sys_nfsservctl,3) + SYSCALL_ENTRY(sys_setresgid16,3) /* 170 */ + SYSCALL_ENTRY(sys_getresgid16,3) + SYSCALL_ENTRY(sys_prctl,5) +
SYSCALL_ENTRY_SPECIAL(sys_rt_sigreturn,0) + SYSCALL_ENTRY(sys_rt_sigaction,4) + SYSCALL_ENTRY(sys_rt_sigprocmask,4) /* 175 */ + SYSCALL_ENTRY(sys_rt_sigpending,2) + SYSCALL_ENTRY(sys_rt_sigtimedwait,4) + SYSCALL_ENTRY(sys_rt_sigqueueinfo,3) + SYSCALL_ENTRY_SPECIAL(sys_rt_sigsuspend,2) + SYSCALL_ENTRY(sys_pread64,5) /* 180 */ + SYSCALL_ENTRY(sys_pwrite64,5) + SYSCALL_ENTRY(sys_chown16,3) + SYSCALL_ENTRY(sys_getcwd,2) + SYSCALL_ENTRY(sys_capget,2) + SYSCALL_ENTRY(sys_capset,2) /* 185 */ + SYSCALL_ENTRY_SPECIAL(sys_sigaltstack,2) + SYSCALL_ENTRY(sys_sendfile,4) + SYSCALL_ENTRY(sys_ni_syscall,0) /* reserved for streams1 */ + SYSCALL_ENTRY(sys_ni_syscall,0) /* reserved for streams2 */ + SYSCALL_ENTRY_SPECIAL(sys_vfork,0) /* 190 */ + SYSCALL_ENTRY(sys_getrlimit,2) + SYSCALL_ENTRY(sys_mmap2,6) + SYSCALL_ENTRY(sys_truncate64,3) + SYSCALL_ENTRY(sys_ftruncate64,3) + SYSCALL_ENTRY(sys_stat64,2) /* 195 */ + SYSCALL_ENTRY(sys_lstat64,2) + SYSCALL_ENTRY(sys_fstat64,2) + SYSCALL_ENTRY(sys_lchown,3) + SYSCALL_ENTRY(sys_getuid,0) + SYSCALL_ENTRY(sys_getgid,0) /* 200 */ + SYSCALL_ENTRY(sys_geteuid,0) + SYSCALL_ENTRY(sys_getegid,0) + SYSCALL_ENTRY(sys_setreuid,2) + SYSCALL_ENTRY(sys_setregid,2) + SYSCALL_ENTRY(sys_getgroups,2) /* 205 */ + SYSCALL_ENTRY(sys_setgroups,2) + SYSCALL_ENTRY(sys_fchown,3) + SYSCALL_ENTRY(sys_setresuid,3) + SYSCALL_ENTRY(sys_getresuid,3) + SYSCALL_ENTRY(sys_setresgid,3) /* 210 */ + SYSCALL_ENTRY(sys_getresgid,3) + SYSCALL_ENTRY(sys_chown,3) + SYSCALL_ENTRY(sys_setuid,1) + SYSCALL_ENTRY(sys_setgid,1) + SYSCALL_ENTRY(sys_setfsuid,1) /* 215 */ + SYSCALL_ENTRY(sys_setfsgid,1) + SYSCALL_ENTRY(sys_pivot_root,2) + SYSCALL_ENTRY(sys_mincore,3) + SYSCALL_ENTRY(sys_madvise,3) + SYSCALL_ENTRY(sys_getdents64,3) /* 220 */ + SYSCALL_ENTRY(sys_fcntl64,3) + SYSCALL_ENTRY(sys_ni_syscall,0) /* reserved for TUX */ + SYSCALL_ENTRY(sys_ni_syscall,0) + SYSCALL_ENTRY(sys_gettid,0) + SYSCALL_ENTRY(sys_readahead,4) /* 225 */ + SYSCALL_ENTRY(sys_setxattr,5) + 
SYSCALL_ENTRY(sys_lsetxattr,5) + SYSCALL_ENTRY(sys_fsetxattr,5) + SYSCALL_ENTRY(sys_getxattr,4) + SYSCALL_ENTRY(sys_lgetxattr,4) /* 230 */ + SYSCALL_ENTRY(sys_fgetxattr,4) + SYSCALL_ENTRY(sys_listxattr,3) + SYSCALL_ENTRY(sys_llistxattr,3) + SYSCALL_ENTRY(sys_flistxattr,3) + SYSCALL_ENTRY(sys_removexattr,2) /* 235 */ + SYSCALL_ENTRY(sys_lremovexattr,2) + SYSCALL_ENTRY(sys_fremovexattr,2) + SYSCALL_ENTRY(sys_tkill,2) + SYSCALL_ENTRY(sys_sendfile64,4) + SYSCALL_ENTRY(sys_futex,5) /* 240 */ + SYSCALL_ENTRY(sys_sched_setaffinity,3) + SYSCALL_ENTRY(sys_sched_getaffinity,3) + SYSCALL_ENTRY(sys_set_thread_area,1) + SYSCALL_ENTRY(sys_get_thread_area,1) + SYSCALL_ENTRY(sys_io_setup,2) /* 245 */ + SYSCALL_ENTRY(sys_io_destroy,1) + SYSCALL_ENTRY(sys_io_getevents,5) + SYSCALL_ENTRY(sys_io_submit,3) + SYSCALL_ENTRY(sys_io_cancel,3) + SYSCALL_ENTRY(sys_fadvise64,5) /* 250 */ + SYSCALL_ENTRY(sys_ni_syscall,0) + SYSCALL_ENTRY(sys_exit_group,1) + SYSCALL_ENTRY(sys_lookup_dcookie,4) + SYSCALL_ENTRY(sys_epoll_create,1) + SYSCALL_ENTRY(sys_epoll_ctl,4) /* 255 */ + SYSCALL_ENTRY(sys_epoll_wait,4) + SYSCALL_ENTRY(sys_remap_file_pages,5) + SYSCALL_ENTRY(sys_set_tid_address,1) + SYSCALL_ENTRY(sys_timer_create,3) + SYSCALL_ENTRY(sys_timer_settime,4) /* 260 */ + SYSCALL_ENTRY(sys_timer_gettime,2) + SYSCALL_ENTRY(sys_timer_getoverrun,1) + SYSCALL_ENTRY(sys_timer_delete,1) + SYSCALL_ENTRY(sys_clock_settime,2) + SYSCALL_ENTRY(sys_clock_gettime,2) /* 265 */ + SYSCALL_ENTRY(sys_clock_getres,2) + SYSCALL_ENTRY(sys_clock_nanosleep,4) + SYSCALL_ENTRY(sys_statfs64,3) + SYSCALL_ENTRY(sys_fstatfs64,3) + SYSCALL_ENTRY(sys_tgkill,3) /* 270 */ + SYSCALL_ENTRY(sys_utimes,2) + SYSCALL_ENTRY(sys_fadvise64_64,6) + SYSCALL_ENTRY(sys_ni_syscall,0) /* sys_vserver */ + syscall_table_size=(.-sys_call_table) diff -urN linux-2.6.3.orig/arch/i386/kernel/head.S linux-2.6.3/arch/i386/kernel/head.S --- linux-2.6.3.orig/arch/i386/kernel/head.S Thu Feb 19 01:25:35 2004 +++ linux-2.6.3/arch/i386/kernel/head.S Sun Feb 22 
00:34:22 2004 @@ -480,7 +480,11 @@ .quad 0x0000000000000000 /* 0xd8 - unused */ .quad 0x0000000000000000 /* 0xe0 - unused */ .quad 0x0000000000000000 /* 0xe8 - unused */ +#ifndef CONFIG_KERNEL_MODE_LINUX .quad 0x0000000000000000 /* 0xf0 - unused */ +#else + .quad 0x0000000000000000 /* 0xf0 - Kernel Stack Location segment (KSL) set at runtime */ +#endif .quad 0x0000000000000000 /* 0xf8 - GDT entry 31: double-fault TSS */ #ifdef CONFIG_SMP diff -urN linux-2.6.3.orig/arch/i386/kernel/kml_call.h linux-2.6.3/arch/i386/kernel/kml_call.h --- linux-2.6.3.orig/arch/i386/kernel/kml_call.h Thu Jan 1 09:00:00 1970 +++ linux-2.6.3/arch/i386/kernel/kml_call.h Sun Feb 22 00:34:22 2004 @@ -0,0 +1,144 @@ +/* + * linux/arch/i386/kernel/kml_call.h + * + * Copyright (C) 2003 Toshiyuki Maeda + */ + +/* + * These are macros for making kml_call_table. + * + * This file should be included only from the "sys_call_table_maker.h" file. + */ + +#ifdef CONFIG_KERNEL_MODE_LINUX + +.macro kml_push_args argnum +.ifeq \argnum +addl $-4, %esp +.endif +.ifeq \argnum - 1 +pushl %ebx +.endif +.ifeq \argnum - 2 +pushl %ecx +kml_push_args 1 +.endif +.ifeq \argnum - 3 +pushl %edx +kml_push_args 2 +.endif +.ifeq \argnum - 4 +pushl %esi +kml_push_args 3 +.endif +.ifeq \argnum - 5 +pushl %edi +kml_push_args 4 +.endif +.ifeq \argnum - 6 +pushl (%ebp) +kml_push_args 5 +.endif +.endm + +/* + * entry.S is compiled with the "-traditional" option. + * So, we perform an old-style concatenation instead of '##'! 
+ */ +#define MAKE_KMLCALL(name, argnum, syscall_num) \ +.ifndef kml_/**/argnum; \ +.text; \ +ENTRY(kml_/**/argnum); \ + pushl %eax; \ + pushl %edx; \ + pushl %ecx; \ + pushl %ebp; \ + movl %esp, %ebp; \ + movl %fs:0x0, %esp; \ +\ + kml_push_args argnum; \ +\ + leal sys_call_table(,%eax,4), %ecx; \ + call *(%ecx); \ +\ + GET_THREAD_INFO(%edx); \ + leave; \ +\ + movl TI_FLAGS(%edx), %ecx; \ + testw $_TIF_ALLWORK_MASK, %cx; \ + popl %ecx; \ + popl %edx; \ + jne 0f; \ + addl $4, %esp; \ + ret; \ +0:; \ + pushl %ecx; \ + movl 4(%esp), %ecx; \ + movl %eax, 4(%esp); \ + movl %ecx, %eax; \ + popl %ecx; \ + pushfl; \ + pushl %cs; \ + pushl $kml_wrapper_int_post; \ + jmp kml_exit_work; \ +.endif; \ +kml_/**/name = kml_/**/argnum + +#define MAKE_KMLCALL_SPECIAL(name, argnum, syscall_num) \ +kml_/**/name = kml_special + +ENTRY(kml_special) + add $-4, %esp + pushfl + pushl %cs + pushl $kml_wrapper_int_post + jmp system_call + +/* generic routines for kml call's exit */ +ENTRY(kml_exit_work) + SWITCH_STACK_TO_KK + + pushl %eax + SAVE_ALL + + movl OLDESP(%esp), %eax + movl (%eax), %eax + movl %eax,EAX(%esp) # store the return value + + GET_THREAD_INFO(%ebp) + jmp syscall_exit + +kml_wrapper_int_pre: + int $0x80 +kml_wrapper_int_post: + addl $4, %esp + ret + +ENTRY(kml_sigreturn_shortcut) + popl %eax + movl $119, %eax # 119 == __NR_sigreturn + jmp return_wrapper + +ENTRY(kml_rt_sigreturn_shortcut) + movl $173, %eax # 173 == __NR_rt_sigreturn +return_wrapper: + movl %fs, %edx + movl $__KERNELSTACK_DS, %ecx + movl %ecx, %fs + movl %esp, %ecx + movl %fs:0x0, %esp + movl %edx, %fs + + addl $-4, %esp # XSS + pushl %ecx # ESP + pushfl # EFLAGS + pushl $(__SW_KM_USER_CS) # XCS + addl $-4, %esp # EIP + + pushl %eax # orig_eax + addl $-36, %esp # SAVE_ALL + + GET_THREAD_INFO(%ebp) + jmp syscall_call + +#endif diff -urN linux-2.6.3.orig/arch/i386/kernel/signal.c linux-2.6.3/arch/i386/kernel/signal.c --- linux-2.6.3.orig/arch/i386/kernel/signal.c Thu Feb 19 01:25:35 2004 +++ 
linux-2.6.3/arch/i386/kernel/signal.c Sun Feb 22 00:34:22 2004 @@ -142,10 +142,26 @@ err |= __get_user(tmp, &sc->seg); \ regs->x##seg = tmp; } +#ifndef CONFIG_KERNEL_MODE_LINUX #define COPY_SEG_STRICT(seg) \ { unsigned short tmp; \ err |= __get_user(tmp, &sc->seg); \ regs->x##seg = tmp|3; } +#else +#define COPY_CS_STRICT \ + { unsigned long tmp; \ + unsigned long mask; \ + err |= __get_user(tmp, &sc->xcs); \ + mask = (regs->xcs == __SW_KM_USER_CS) ? 0 : (regs->xcs & 3); \ + regs->xcs = tmp | mask; } + +#define COPY_SS_STRICT \ + { unsigned short tmp; \ + unsigned long mask; \ + err |= __get_user(tmp, &sc->ss); \ + mask = (regs->xcs == __SW_KM_USER_CS) ? 0 : (regs->xcs & 3); \ + regs->xss = tmp | mask; } +#endif #define GET_SEG(seg) \ { unsigned short tmp; \ @@ -164,8 +180,13 @@ COPY(edx); COPY(ecx); COPY(eip); +#ifndef CONFIG_KERNEL_MODE_LINUX COPY_SEG_STRICT(cs); COPY_SEG_STRICT(ss); +#else + COPY_CS_STRICT; + COPY_SS_STRICT; +#endif { unsigned int tmpflags; @@ -290,7 +311,11 @@ err |= __put_user(current->thread.trap_no, &sc->trapno); err |= __put_user(current->thread.error_code, &sc->err); err |= __put_user(regs->eip, &sc->eip); +#ifndef CONFIG_KERNEL_MODE_LINUX err |= __put_user(regs->xcs, (unsigned int *)&sc->cs); +#else + err |= __put_user(regs->xcs, &sc->xcs); +#endif err |= __put_user(regs->eflags, &sc->eflags); err |= __put_user(regs->esp, &sc->esp_at_signal); err |= __put_user(regs->xss, (unsigned int *)&sc->ss); @@ -327,6 +352,9 @@ /* This is the legacy signal stack switching. */ else if ((regs->xss & 0xffff) != __USER_DS && +#ifdef CONFIG_KERNEL_MODE_LINUX + (regs->esp > TASK_SIZE) && +#endif !(ka->sa.sa_flags & SA_RESTORER) && ka->sa.sa_restorer) { esp = (unsigned long) ka->sa.sa_restorer; @@ -339,6 +367,11 @@ See vsyscall-sigreturn.S. 
*/ extern void __kernel_sigreturn, __kernel_rt_sigreturn; +#ifdef CONFIG_KERNEL_MODE_LINUX +extern void kml_sigreturn_shortcut(void); +extern void kml_rt_sigreturn_shortcut(void); +#endif + static void setup_frame(int sig, struct k_sigaction *ka, sigset_t *set, struct pt_regs * regs) { @@ -371,8 +404,21 @@ if (err) goto give_sigsegv; +#ifndef CONFIG_KERNEL_MODE_LINUX restorer = &__kernel_sigreturn; +#else + if (regs->xcs != __SW_KM_USER_CS) { + restorer = &__kernel_sigreturn; + } else { + restorer = (void*) kml_sigreturn_shortcut; + } +#endif + +#ifndef CONFIG_KERNEL_MODE_LINUX if (ka->sa.sa_flags & SA_RESTORER) +#else + if ((ka->sa.sa_flags & SA_RESTORER) && (regs->xcs != __SW_KM_USER_CS)) +#endif restorer = ka->sa.sa_restorer; /* Set up to return from userspace. */ @@ -396,11 +442,27 @@ regs->esp = (unsigned long) frame; regs->eip = (unsigned long) ka->sa.sa_handler; +#ifndef CONFIG_KERNEL_MODE_LINUX set_fs(USER_DS); regs->xds = __USER_DS; regs->xes = __USER_DS; regs->xss = __USER_DS; regs->xcs = __USER_CS; +#else + if (regs->xcs == __SW_KM_USER_CS) { + set_fs(KERNEL_DS); + regs->xds = __USER_DS; + regs->xes = __USER_DS; + regs->xss = __KERNEL_DS; + regs->xcs = __SW_KM_USER_CS; + } else { + set_fs(USER_DS); + regs->xds = __USER_DS; + regs->xes = __USER_DS; + regs->xss = __USER_DS; + regs->xcs = __USER_CS; + } +#endif regs->eflags &= ~TF_MASK; #if DEBUG_SIG @@ -454,8 +516,21 @@ goto give_sigsegv; /* Set up to return from userspace. 
*/ +#ifndef CONFIG_KERNEL_MODE_LINUX restorer = &__kernel_rt_sigreturn; +#else + if (regs->xcs != __SW_KM_USER_CS) { + restorer = &__kernel_rt_sigreturn; + } else { + restorer = (void*) kml_rt_sigreturn_shortcut; + } +#endif + +#ifndef CONFIG_KERNEL_MODE_LINUX if (ka->sa.sa_flags & SA_RESTORER) +#else + if ((ka->sa.sa_flags & SA_RESTORER) && (regs->xcs != __SW_KM_USER_CS)) +#endif restorer = ka->sa.sa_restorer; err |= __put_user(restorer, &frame->pretcode); @@ -477,11 +552,27 @@ regs->esp = (unsigned long) frame; regs->eip = (unsigned long) ka->sa.sa_handler; +#ifndef CONFIG_KERNEL_MODE_LINUX set_fs(USER_DS); regs->xds = __USER_DS; regs->xes = __USER_DS; regs->xss = __USER_DS; regs->xcs = __USER_CS; +#else + if (regs->xcs == __SW_KM_USER_CS) { + set_fs(KERNEL_DS); + regs->xds = __USER_DS; + regs->xes = __USER_DS; + regs->xss = __KERNEL_DS; + regs->xcs = __SW_KM_USER_CS; + } else { + set_fs(USER_DS); + regs->xds = __USER_DS; + regs->xes = __USER_DS; + regs->xss = __USER_DS; + regs->xcs = __USER_CS; + } +#endif regs->eflags &= ~TF_MASK; #if DEBUG_SIG diff -urN linux-2.6.3.orig/arch/i386/kernel/sys_call_table_maker.h linux-2.6.3/arch/i386/kernel/sys_call_table_maker.h --- linux-2.6.3.orig/arch/i386/kernel/sys_call_table_maker.h Thu Jan 1 09:00:00 1970 +++ linux-2.6.3/arch/i386/kernel/sys_call_table_maker.h Sun Feb 22 00:34:22 2004 @@ -0,0 +1,76 @@ +/* + * linux/arch/i386/kernel/sys_call_table_maker.h + * + * Copyright (C) 2002 Toshiyuki Maeda + */ + +/* + * These are macros for making sys_call_table. + * + * This file should be included only from the "entry.S" file. 
+ */ + +#ifndef CONFIG_KERNEL_MODE_LINUX + +#define SYSCALL_TABLE_BEGIN \ +.data; \ +ENTRY(sys_call_table); + +#define SYSCALL_ENTRY(name,argnum) \ +.long name; + +#define SYSCALL_ENTRY_SPECIAL(name,argnum) \ +.long name; + +#else + +#include "kml_call.h" +#include "direct_call.h" + +#define SYSCALL_TABLE_BEGIN \ +SYSCALL_NUM=0; \ +.data 0; \ +ENTRY(sys_call_table); \ +.data 1; \ +ENTRY(kml_call_table); \ +.data 2; \ +ENTRY(direct_call_table); \ +.data 0; + +/* + * entry.S is compiled with the "-traditional" option. + * So, we perform an old-style concatenation instead of '##'! + */ +#define SYSCALL_ENTRY(name,argnum) \ +.data 0; \ +.long name; \ +.ifndef kml_/**/name; \ +MAKE_KMLCALL(name,argnum,SYSCALL_NUM); \ +.endif; \ +.data 1; \ +.long kml_/**/name; \ +.ifndef direct_/**/name; \ +MAKE_DIRECTCALL(name,argnum,SYSCALL_NUM); \ +.endif; \ +.data 2; \ +.long direct_/**/name; \ +.data 0; \ +SYSCALL_NUM=SYSCALL_NUM+1; + +#define SYSCALL_ENTRY_SPECIAL(name,argnum) \ +.data 0; \ +.long name; \ +.ifndef kml_/**/name; \ +MAKE_KMLCALL_SPECIAL(name,argnum,SYSCALL_NUM); \ +.endif; \ +.data 1; \ +.long kml_/**/name; \ +.ifndef direct_/**/name; \ +MAKE_DIRECTCALL_SPECIAL(name,argnum,SYSCALL_NUM); \ +.endif; \ +.data 2; \ +.long direct_/**/name; \ +.data 0; \ +SYSCALL_NUM=SYSCALL_NUM+1; + +#endif diff -urN linux-2.6.3.orig/arch/i386/kernel/sysenter.c linux-2.6.3/arch/i386/kernel/sysenter.c --- linux-2.6.3.orig/arch/i386/kernel/sysenter.c Thu Feb 19 01:25:35 2004 +++ linux-2.6.3/arch/i386/kernel/sysenter.c Sun Feb 22 00:34:22 2004 @@ -63,3 +63,41 @@ } __initcall(sysenter_setup); + +#ifdef CONFIG_KERNEL_MODE_LINUX +extern const char vsyscall_kml_start, vsyscall_kml_end; +extern const char kml_call_table; + +static void __init kml_call_table_fixup(void) +{ + int off; + unsigned long* to_be_filled; + + /* Patch the first instruction in vsysenter-kml.so */ + off = VSYSCALL_KML_ENTRY - VSYSCALL_KML_BASE; + to_be_filled = (unsigned long*)(&vsyscall_kml_start + off + 3); + 
*to_be_filled = (unsigned long)&kml_call_table; + + return; +} + +static int __init kml_setup(void) +{ + unsigned long page; + + kml_call_table_fixup(); + + page = get_zeroed_page(GFP_ATOMIC); + + __set_fixmap(FIX_VSYSCALL_KML, __pa(page), PAGE_KERNEL_RO); + + memcpy((void *) page, + &vsyscall_kml_start, + &vsyscall_kml_end - &vsyscall_kml_start); + + return 0; +} + +__initcall(kml_setup); +#endif + diff -urN linux-2.6.3.orig/arch/i386/kernel/vsyscall-common.lds linux-2.6.3/arch/i386/kernel/vsyscall-common.lds --- linux-2.6.3.orig/arch/i386/kernel/vsyscall-common.lds Thu Jan 1 09:00:00 1970 +++ linux-2.6.3/arch/i386/kernel/vsyscall-common.lds Sun Feb 22 00:34:22 2004 @@ -0,0 +1,46 @@ +/* + * Linker script for vsyscall DSO. The vsyscall page is an ELF shared + * object prelinked to its virtual address, and with only one read-only + * segment (that fits in one page). This script controls its layout. + */ + +SECTIONS +{ + . = VSYSCALL_BASE + SIZEOF_HEADERS; + + .hash : { *(.hash) } :text + .dynsym : { *(.dynsym) } + .dynstr : { *(.dynstr) } + .gnu.version : { *(.gnu.version) } + .gnu.version_d : { *(.gnu.version_d) } + .gnu.version_r : { *(.gnu.version_r) } + + /* This linker script is used both with -r and with -shared. + For the layouts to match, we need to skip more than enough + space for the dynamic symbol table et al. If this amount + is insufficient, ld -shared will barf. Just increase it here. */ + . = VSYSCALL_BASE + 0x400; + + .text : { *(.text) } :text =0x90909090 + + .eh_frame_hdr : { *(.eh_frame_hdr) } :text :eh_frame_hdr + .eh_frame : { KEEP (*(.eh_frame)) } :text + .dynamic : { *(.dynamic) } :text :dynamic + .useless : { + *(.got.plt) *(.got) + *(.data .data.* .gnu.linkonce.d.*) + *(.dynbss) + *(.bss .bss.* .gnu.linkonce.b.*) + } :text +} + +/* + * We must supply the ELF program headers explicitly to get just one + * PT_LOAD segment, and set the flags explicitly to make segments read-only. 
+ */ +PHDRS +{ + text PT_LOAD FILEHDR PHDRS FLAGS(5); /* PF_R|PF_X */ + dynamic PT_DYNAMIC FLAGS(4); /* PF_R */ + eh_frame_hdr 0x6474e550; /* PT_GNU_EH_FRAME, but ld doesn't match the name */ +} diff -urN linux-2.6.3.orig/arch/i386/kernel/vsyscall-int80.lds.S linux-2.6.3/arch/i386/kernel/vsyscall-int80.lds.S --- linux-2.6.3.orig/arch/i386/kernel/vsyscall-int80.lds.S Thu Jan 1 09:00:00 1970 +++ linux-2.6.3/arch/i386/kernel/vsyscall-int80.lds.S Sun Feb 22 00:34:22 2004 @@ -0,0 +1,23 @@ + +/* This must match <asm/fixmap.h>. */ +#define VSYSCALL_BASE 0xffffe000 + +#include "vsyscall-common.lds" + +/* + * This controls what symbols we export from the DSO. + */ +VERSION +{ + LINUX_2.5 { + global: + __kernel_vsyscall; + __kernel_sigreturn; + __kernel_rt_sigreturn; + + local: *; + }; +} + +/* The ELF entry point can be used to set the AT_SYSINFO value. */ +ENTRY(__kernel_vsyscall); diff -urN linux-2.6.3.orig/arch/i386/kernel/vsyscall-kml.S linux-2.6.3/arch/i386/kernel/vsyscall-kml.S --- linux-2.6.3.orig/arch/i386/kernel/vsyscall-kml.S Thu Jan 1 09:00:00 1970 +++ linux-2.6.3/arch/i386/kernel/vsyscall-kml.S Sun Feb 22 00:34:22 2004 @@ -0,0 +1,46 @@ +#include <linux/config.h> + +#ifdef CONFIG_KERNEL_MODE_LINUX +/* + * Code for the vsyscall page. This version uses the KML direct call method. + */ + + .text + .globl __kernel_vsyscall_kml + .type __kernel_vsyscall_kml,@function +__kernel_vsyscall_kml: +.LSTART_vsyscall: + jmp *kml_call_table(,%eax,4) +.LEND_vsyscall: + .size __kernel_vsyscall_kml,.-.LSTART_vsyscall + .previous + + .section .eh_frame,"a",@progbits +.LSTARTFRAMEDLSI: + .long .LENDCIEDLSI-.LSTARTCIEDLSI +.LSTARTCIEDLSI: + .long 0 /* CIE ID */ + .byte 1 /* Version number */ + .string "zR" /* NUL-terminated augmentation string */ + .uleb128 1 /* Code alignment factor */ + .sleb128 -4 /* Data alignment factor */ + .byte 8 /* Return address register column */ + .uleb128 1 /* Augmentation value length */ + .byte 0x1b /* DW_EH_PE_pcrel|DW_EH_PE_sdata4.
*/ + .byte 0x0c /* DW_CFA_def_cfa */ + .uleb128 4 + .uleb128 4 + .byte 0x88 /* DW_CFA_offset, column 0x8 */ + .uleb128 1 + .align 4 +.LENDCIEDLSI: + .long .LENDFDEDLSI-.LSTARTFDEDLSI /* Length FDE */ +.LSTARTFDEDLSI: + .long .LSTARTFDEDLSI-.LSTARTFRAMEDLSI /* CIE pointer */ + .long .LSTART_vsyscall-. /* PC-relative start address */ + .long .LEND_vsyscall-.LSTART_vsyscall + .uleb128 0 + .align 4 +.LENDFDEDLSI: + .previous +#endif diff -urN linux-2.6.3.orig/arch/i386/kernel/vsyscall-kml.lds.S linux-2.6.3/arch/i386/kernel/vsyscall-kml.lds.S --- linux-2.6.3.orig/arch/i386/kernel/vsyscall-kml.lds.S Thu Jan 1 09:00:00 1970 +++ linux-2.6.3/arch/i386/kernel/vsyscall-kml.lds.S Sun Feb 22 00:34:22 2004 @@ -0,0 +1,21 @@ + +/* This must match <asm/fixmap.h>. */ +#define VSYSCALL_BASE 0xffffd000 + +#include "vsyscall-common.lds" + +/* + * This controls what symbols we export from the DSO. + */ +VERSION +{ + LINUX_2.5 { + global: + __kernel_vsyscall_kml; + + local: *; + }; +} + +/* The ELF entry point can be used to set the AT_SYSINFO value. */ +ENTRY(__kernel_vsyscall_kml); diff -urN linux-2.6.3.orig/arch/i386/kernel/vsyscall-sysenter.lds.S linux-2.6.3/arch/i386/kernel/vsyscall-sysenter.lds.S --- linux-2.6.3.orig/arch/i386/kernel/vsyscall-sysenter.lds.S Thu Jan 1 09:00:00 1970 +++ linux-2.6.3/arch/i386/kernel/vsyscall-sysenter.lds.S Sun Feb 22 00:34:22 2004 @@ -0,0 +1,23 @@ + +/* This must match <asm/fixmap.h>. */ +#define VSYSCALL_BASE 0xffffe000 + +#include "vsyscall-common.lds" + +/* + * This controls what symbols we export from the DSO. + */ +VERSION +{ + LINUX_2.5 { + global: + __kernel_vsyscall; + __kernel_sigreturn; + __kernel_rt_sigreturn; + + local: *; + }; +} + +/* The ELF entry point can be used to set the AT_SYSINFO value.
*/ +ENTRY(__kernel_vsyscall); diff -urN linux-2.6.3.orig/arch/i386/kernel/vsyscall.S linux-2.6.3/arch/i386/kernel/vsyscall.S --- linux-2.6.3.orig/arch/i386/kernel/vsyscall.S Thu Feb 19 01:25:35 2004 +++ linux-2.6.3/arch/i386/kernel/vsyscall.S Sun Feb 22 00:34:22 2004 @@ -12,4 +12,11 @@ .incbin "arch/i386/kernel/vsyscall-sysenter.so" vsyscall_sysenter_end: +#ifdef CONFIG_KERNEL_MODE_LINUX + .globl vsyscall_kml_start, vsyscall_kml_end +vsyscall_kml_start: + .incbin "arch/i386/kernel/vsyscall-kml.so" +vsyscall_kml_end: +#endif + __FINIT diff -urN linux-2.6.3.orig/arch/i386/kernel/vsyscall.lds linux-2.6.3/arch/i386/kernel/vsyscall.lds --- linux-2.6.3.orig/arch/i386/kernel/vsyscall.lds Thu Feb 19 01:25:35 2004 +++ linux-2.6.3/arch/i386/kernel/vsyscall.lds Thu Jan 1 09:00:00 1970 @@ -1,67 +0,0 @@ -/* - * Linker script for vsyscall DSO. The vsyscall page is an ELF shared - * object prelinked to its virtual address, and with only one read-only - * segment (that fits in one page). This script controls its layout. - */ - -/* This must match <asm/fixmap.h>. */ -VSYSCALL_BASE = 0xffffe000; - -SECTIONS -{ - . = VSYSCALL_BASE + SIZEOF_HEADERS; - - .hash : { *(.hash) } :text - .dynsym : { *(.dynsym) } - .dynstr : { *(.dynstr) } - .gnu.version : { *(.gnu.version) } - .gnu.version_d : { *(.gnu.version_d) } - .gnu.version_r : { *(.gnu.version_r) } - - /* This linker script is used both with -r and with -shared. - For the layouts to match, we need to skip more than enough - space for the dynamic symbol table et al. If this amount - is insufficient, ld -shared will barf. Just increase it here. */ - .
= VSYSCALL_BASE + 0x400; - - .text : { *(.text) } :text =0x90909090 - - .eh_frame_hdr : { *(.eh_frame_hdr) } :text :eh_frame_hdr - .eh_frame : { KEEP (*(.eh_frame)) } :text - .dynamic : { *(.dynamic) } :text :dynamic - .useless : { - *(.got.plt) *(.got) - *(.data .data.* .gnu.linkonce.d.*) - *(.dynbss) - *(.bss .bss.* .gnu.linkonce.b.*) - } :text -} - -/* - * We must supply the ELF program headers explicitly to get just one - * PT_LOAD segment, and set the flags explicitly to make segments read-only. - */ -PHDRS -{ - text PT_LOAD FILEHDR PHDRS FLAGS(5); /* PF_R|PF_X */ - dynamic PT_DYNAMIC FLAGS(4); /* PF_R */ - eh_frame_hdr 0x6474e550; /* PT_GNU_EH_FRAME, but ld doesn't match the name */ -} - -/* - * This controls what symbols we export from the DSO. - */ -VERSION -{ - LINUX_2.5 { - global: - __kernel_vsyscall; - __kernel_sigreturn; - __kernel_rt_sigreturn; - - local: *; - }; -} - -/* The ELF entry point can be used to set the AT_SYSINFO value. */ -ENTRY(__kernel_vsyscall); diff -urN linux-2.6.3.orig/arch/i386/mm/fault.c linux-2.6.3/arch/i386/mm/fault.c --- linux-2.6.3.orig/arch/i386/mm/fault.c Thu Feb 19 01:25:35 2004 +++ linux-2.6.3/arch/i386/mm/fault.c Sun Feb 22 00:34:22 2004 @@ -226,6 +226,11 @@ if (regs->eflags & (X86_EFLAGS_IF|VM_MASK)) local_irq_enable(); +#ifdef CONFIG_KERNEL_MODE_LINUX + if (regs->xcs == __SW_KM_USER_CS) + error_code |= 0x4; +#endif + tsk = current; info.si_code = SEGV_MAPERR; diff -urN linux-2.6.3.orig/fs/binfmt_elf.c linux-2.6.3/fs/binfmt_elf.c --- linux-2.6.3.orig/fs/binfmt_elf.c Thu Feb 19 01:25:52 2004 +++ linux-2.6.3/fs/binfmt_elf.c Sun Feb 22 00:34:22 2004 @@ -129,7 +129,11 @@ static void create_elf_tables(struct linux_binprm *bprm, struct elfhdr * exec, int interp_aout, unsigned long load_addr, - unsigned long interp_load_addr) + unsigned long interp_load_addr +#ifdef CONFIG_KERNEL_MODE_LINUX + , int kernel_mode +#endif +) { unsigned long p = bprm->p; int argc = bprm->argc; @@ -452,6 +456,42 @@ #define INTERPRETER_AOUT 1 #define 
INTERPRETER_ELF 2 +#ifdef CONFIG_KERNEL_MODE_LINUX +/* + * XXX : we haven't implemented safety check of user programs; is_safe() only tests that the executable's path starts with TRUSTED_DIR_STR and that IS_ROOT(current->fs->root) holds. + */ +#define TRUSTED_DIR_STR "/trusted/" +#define TRUSTED_DIR_STR_LEN 9 + +static inline int is_safe(struct file* file) +{ + int ret; + char* path; + char* tmp; + struct fs_struct* cur_fs; + + tmp = (char*)__get_free_page(GFP_KERNEL); + + if (!tmp) { + return 0; + } + + path = d_path(file->f_dentry, file->f_vfsmnt, tmp, PAGE_SIZE); + ret = !IS_ERR(path) && (0 == strncmp(TRUSTED_DIR_STR, path, TRUSTED_DIR_STR_LEN)); /* d_path() may return an ERR_PTR; treat that as "not trusted" instead of dereferencing it */ + if (ret) { + /* Check whether we are "chroot"ed */ + /* XXX : I don't know how to check whether we are chrooted. Is this code correct? */ + cur_fs = current->fs; + read_lock(&cur_fs->lock); + spin_lock(&dcache_lock); + ret = IS_ROOT(cur_fs->root); + spin_unlock(&dcache_lock); + read_unlock(&cur_fs->lock); + } + free_page((unsigned long)tmp); + return ret; +} +#endif static int load_elf_binary(struct linux_binprm * bprm, struct pt_regs * regs) { @@ -475,6 +515,9 @@ struct exec interp_ex; char passed_fileno[6]; struct files_struct *files; +#ifdef CONFIG_KERNEL_MODE_LINUX + int kernel_mode = 0; +#endif /* Get the exec-header */ elf_ex = *((struct elfhdr *) bprm->buf); @@ -813,8 +856,15 @@ compute_creds(bprm); current->flags &= ~PF_FORKNOEXEC; +#ifdef CONFIG_KERNEL_MODE_LINUX + kernel_mode = is_safe(bprm->file); +#endif create_elf_tables(bprm, &elf_ex, (interpreter_type == INTERPRETER_AOUT), - load_addr, interp_load_addr); + load_addr, interp_load_addr +#ifdef CONFIG_KERNEL_MODE_LINUX + , kernel_mode +#endif + ); /* N.B. passed_fileno might not be initialized?
*/ if (interpreter_type == INTERPRETER_AOUT) current->mm->arg_start += strlen(passed_fileno) + 1; @@ -850,7 +900,15 @@ ELF_PLAT_INIT(regs, reloc_func_desc); #endif +#ifndef CONFIG_KERNEL_MODE_LINUX start_thread(regs, elf_entry, bprm->p); +#else + if (kernel_mode) { /* executable passed is_safe(): start it with the kernel-mode register setup */ + start_kernel_thread(regs, elf_entry, bprm->p); + } else { + start_thread(regs, elf_entry, bprm->p); + } +#endif if (unlikely(current->ptrace & PT_PTRACED)) { if (current->ptrace & PT_TRACE_EXEC) ptrace_notify ((PTRACE_EVENT_EXEC << 8) | SIGTRAP); diff -urN linux-2.6.3.orig/include/asm-i386/desc.h linux-2.6.3/include/asm-i386/desc.h --- linux-2.6.3.orig/include/asm-i386/desc.h Thu Feb 19 01:25:32 2004 +++ linux-2.6.3/include/asm-i386/desc.h Sun Feb 22 00:34:22 2004 @@ -54,6 +54,25 @@ _set_tssldt_desc(&cpu_gdt_table[cpu][GDT_ENTRY_LDT], (int)addr, ((size << 3)-1), 0x82); } +#ifdef CONFIG_KERNEL_MODE_LINUX + +#define _set_codedata_seg_desc(n,addr,type) /* fill the 8-byte descriptor at n: 0xffff at bytes 0-1, addr bits 0-15 at bytes 2-3, addr bits 16-23 at byte 4, type at byte 5, 0xcf at byte 6, addr bits 24-31 at byte 7; eax is rotated back at the end */ \ +__asm__ __volatile__ ("movw $0xffff,0(%2)\n\t" \ + "movw %%ax,2(%2)\n\t" \ + "rorl $16,%%eax\n\t" \ + "movb %%al,4(%2)\n\t" \ + "movb %3,5(%2)\n\t" \ + "movb $0xcf,6(%2)\n\t" \ + "movb %%ah,7(%2)\n\t" \ + "rorl $16,%%eax" \ + : "=m"(*(n)) : "a" (addr), "r"(n), "i"(type)) + +static inline void set_ksl_desc(unsigned int cpu, void* addr) /* write cpu's GDT_ENTRY_KSL descriptor with base addr and type byte 0x92 */ +{ + _set_codedata_seg_desc(&cpu_gdt_table[cpu][GDT_ENTRY_KSL], (int)addr, 0x92); +} +#endif + #define LDT_entry_a(info) \ ((((info)->base_addr & 0x0000ffff) << 16) | ((info)->limit & 0x0ffff)) diff -urN linux-2.6.3.orig/include/asm-i386/elf.h linux-2.6.3/include/asm-i386/elf.h --- linux-2.6.3.orig/include/asm-i386/elf.h Thu Feb 19 01:25:33 2004 +++ linux-2.6.3/include/asm-i386/elf.h Sun Feb 22 00:34:22 2004 @@ -131,12 +131,31 @@ #define VSYSCALL_EHDR ((const struct elfhdr *) VSYSCALL_BASE) #define VSYSCALL_ENTRY ((unsigned long) &__kernel_vsyscall) extern void __kernel_vsyscall; +#ifdef CONFIG_KERNEL_MODE_LINUX +#define VSYSCALL_KML_BASE (__fix_to_virt(FIX_VSYSCALL_KML)) +#define VSYSCALL_KML_EHDR ((const struct elfhdr *)
VSYSCALL_KML_BASE) +#define VSYSCALL_KML_ENTRY ((unsigned long) &__kernel_vsyscall_kml) +extern void __kernel_vsyscall_kml; +#endif +#ifndef CONFIG_KERNEL_MODE_LINUX #define ARCH_DLINFO \ do { \ NEW_AUX_ENT(AT_SYSINFO, VSYSCALL_ENTRY); \ NEW_AUX_ENT(AT_SYSINFO_EHDR, VSYSCALL_BASE); \ } while (0) +#else +#define ARCH_DLINFO /* KML variant: the expansion reads a variable named kernel_mode, which must be in scope where the macro is used, and emits the KML vsyscall entries when it is nonzero */ \ +do { \ + if (kernel_mode) { \ + NEW_AUX_ENT(AT_SYSINFO, VSYSCALL_KML_ENTRY); \ + NEW_AUX_ENT(AT_SYSINFO_EHDR, VSYSCALL_KML_BASE);\ + } else { \ + NEW_AUX_ENT(AT_SYSINFO, VSYSCALL_ENTRY); \ + NEW_AUX_ENT(AT_SYSINFO_EHDR, VSYSCALL_BASE); \ + } \ +} while (0) +#endif /* * These macros parameterize elf_core_dump in fs/binfmt_elf.c to write out diff -urN linux-2.6.3.orig/include/asm-i386/fixmap.h linux-2.6.3/include/asm-i386/fixmap.h --- linux-2.6.3.orig/include/asm-i386/fixmap.h Thu Feb 19 01:25:33 2004 +++ linux-2.6.3/include/asm-i386/fixmap.h Sun Feb 22 00:34:22 2004 @@ -44,6 +44,9 @@ enum fixed_addresses { FIX_HOLE, FIX_VSYSCALL, +#ifdef CONFIG_KERNEL_MODE_LINUX + FIX_VSYSCALL_KML, +#endif #ifdef CONFIG_X86_LOCAL_APIC FIX_APIC_BASE, /* local (CPU) APIC) -- required for SMP or not */ #endif diff -urN linux-2.6.3.orig/include/asm-i386/mmu_context.h linux-2.6.3/include/asm-i386/mmu_context.h --- linux-2.6.3.orig/include/asm-i386/mmu_context.h Thu Feb 19 01:25:32 2004 +++ linux-2.6.3/include/asm-i386/mmu_context.h Sun Feb 22 00:34:22 2004 @@ -38,6 +38,10 @@ #endif cpu_set(cpu, next->cpu_vm_mask); +#ifdef CONFIG_KERNEL_MODE_LINUX + doublefault_tsses[cpu].__cr3 = __pa(next->pgd); /* store the physical address of the incoming mm's page directory in this CPU's double-fault TSS */ +#endif + /* Re-load page tables */ load_cr3(next->pgd); diff -urN linux-2.6.3.orig/include/asm-i386/processor.h linux-2.6.3/include/asm-i386/processor.h --- linux-2.6.3.orig/include/asm-i386/processor.h Thu Feb 19 01:25:33 2004 +++ linux-2.6.3/include/asm-i386/processor.h Sun Feb 22 00:34:22 2004 @@ -84,7 +84,13 @@ extern struct cpuinfo_x86 boot_cpu_data; extern struct cpuinfo_x86 new_cpu_data; extern struct tss_struct init_tss[NR_CPUS]; +#ifndef CONFIG_KERNEL_MODE_LINUX
extern struct tss_struct doublefault_tss; +#else +extern struct tss_struct doublefault_tsses[NR_CPUS]; +extern struct dft_stack_struct dft_stacks[NR_CPUS]; +extern void init_doublefault(int); +#endif #ifdef CONFIG_SMP extern struct cpuinfo_x86 cpu_data[]; @@ -427,6 +433,14 @@ unsigned long *io_bitmap_ptr; }; +#ifdef CONFIG_KERNEL_MODE_LINUX +struct dft_stack_struct { /* element type of dft_stacks[NR_CPUS]; presumably per-CPU state for the double-fault handler -- confirm against its users */ + unsigned long error_code; + struct tss_struct* current_tss; + struct tss_struct* previous_tss; +}; +#endif + #define INIT_THREAD { \ .vm86_info = NULL, \ .sysenter_cs = __KERNEL_CS, \ @@ -459,6 +473,7 @@ } } +#ifndef CONFIG_KERNEL_MODE_LINUX #define start_thread(regs, new_eip, new_esp) do { \ __asm__("movl %0,%%fs ; movl %0,%%gs": :"r" (0)); \ set_fs(USER_DS); \ @@ -469,6 +484,30 @@ regs->eip = new_eip; \ regs->esp = new_esp; \ } while (0) +#else +#define start_thread(regs, new_eip, new_esp) do { \ + __asm__("movl %0,%%fs ; movl %0,%%gs": :"r" (0)); \ + set_fs(USER_DS); \ + regs->xds = __USER_DS; \ + regs->xes = __USER_DS; \ + regs->xss = __USER_DS; \ + regs->xcs = __USER_CS; \ + regs->eip = new_eip; \ + regs->esp = new_esp; \ +} while (0) + +#define start_kernel_thread(regs, new_eip, new_esp) do { /* same shape as start_thread, but loads %fs with __KERNELSTACK_DS, calls set_fs(KERNEL_DS), and sets xss to __KERNEL_DS and xcs to the __SW_KM_USER_CS marker */ \ + __asm__("movl %0,%%fs": :"r" (__KERNELSTACK_DS)); \ + __asm__("movl %0,%%gs": :"r" (0)); \ + set_fs(KERNEL_DS); \ + regs->xds = __USER_DS; \ + regs->xes = __USER_DS; \ + regs->xss = __KERNEL_DS; \ + regs->xcs = __SW_KM_USER_CS; \ + regs->eip = new_eip; \ + regs->esp = new_esp; \ +} while (0) +#endif /* Forward declaration, a strange C thing */ struct task_struct; diff -urN linux-2.6.3.orig/include/asm-i386/segment.h linux-2.6.3/include/asm-i386/segment.h --- linux-2.6.3.orig/include/asm-i386/segment.h Thu Feb 19 01:25:33 2004 +++ linux-2.6.3/include/asm-i386/segment.h Sun Feb 22 00:34:22 2004 @@ -1,6 +1,8 @@ #ifndef _ASM_SEGMENT_H #define _ASM_SEGMENT_H +#include <linux/config.h> + /* * The layout of the per-CPU GDT under Linux: * @@ -42,7 +44,11 @@ * 27 - unused * 28 - unused * 29 - unused +#ifndef
CONFIG_KERNEL_MODE_LINUX * 30 - unused +#else + * 30 - Kernel Stack Location segment (KSL) +#endif * 31 - TSS for double fault handler */ #define GDT_ENTRY_TLS_ENTRIES 3 @@ -71,6 +77,11 @@ #define GDT_ENTRY_PNPBIOS_BASE (GDT_ENTRY_KERNEL_BASE + 6) #define GDT_ENTRY_APMBIOS_BASE (GDT_ENTRY_KERNEL_BASE + 11) +#ifdef CONFIG_KERNEL_MODE_LINUX +#define GDT_ENTRY_KSL 30 /* GDT slot listed as the Kernel Stack Location segment in the layout comment */ +#define __KERNELSTACK_DS (GDT_ENTRY_KSL * 8) /* selector value for GDT_ENTRY_KSL (index times 8); start_kernel_thread loads it into %fs */ +#endif + #define GDT_ENTRY_DOUBLEFAULT_TSS 31 /* @@ -94,5 +105,9 @@ * of tasks we can have.. */ #define IDT_ENTRIES 256 + +#ifdef CONFIG_KERNEL_MODE_LINUX +#define __SW_KM_USER_CS (0xffff0000 | __USER_CS) /* __USER_CS with the upper 16 bits set: the software-only value kept in regs->xcs to mark a kernel-mode user process */ +#endif #endif diff -urN linux-2.6.3.orig/include/asm-i386/sigcontext.h linux-2.6.3/include/asm-i386/sigcontext.h --- linux-2.6.3.orig/include/asm-i386/sigcontext.h Thu Feb 19 01:25:33 2004 +++ linux-2.6.3/include/asm-i386/sigcontext.h Sun Feb 22 00:34:22 2004 @@ -72,7 +72,11 @@ unsigned long trapno; unsigned long err; unsigned long eip; +#ifndef CONFIG_KERNEL_MODE_LINUX unsigned short cs, __csh; +#else + unsigned long xcs; /* single long in place of the cs/__csh pair; NOTE(review): this struct looks user-visible, so confirm the rename is acceptable */ +#endif unsigned long eflags; unsigned long esp_at_signal; unsigned short ss, __ssh; diff -urN linux-2.6.3.orig/include/linux/mm.h linux-2.6.3/include/linux/mm.h --- linux-2.6.3.orig/include/linux/mm.h Thu Feb 19 01:25:27 2004 +++ linux-2.6.3/include/linux/mm.h Sun Feb 22 00:34:22 2004 @@ -647,5 +647,9 @@ int in_gate_area(struct task_struct *task, unsigned long addr); #endif +#ifdef CONFIG_KERNEL_MODE_LINUX +extern asmlinkage int address_presents_and_writable(unsigned long address); /* defined in mm/memory.c by this patch */ +#endif + #endif /* __KERNEL__ */ #endif /* _LINUX_MM_H */ diff -urN linux-2.6.3.orig/mm/memory.c linux-2.6.3/mm/memory.c --- linux-2.6.3.orig/mm/memory.c Thu Feb 19 01:26:06 2004 +++ linux-2.6.3/mm/memory.c Sun Feb 22 00:34:22 2004 @@ -1725,3 +1725,63 @@ } #endif + +#ifdef CONFIG_KERNEL_MODE_LINUX +static inline int address_presents_and_writable_in_pmd(pmd_t* pmd, unsigned long address) /* returns 0 for a none, bad or non-present pmd entry, otherwise inspects the pte for address */ +{ + pte_t* pte; + int result; + + if (pmd_none(*pmd)) + return 0; + if
(pmd_bad(*pmd)) { + pmd_ERROR(*pmd); + return 0; + } + if (!pmd_present(*pmd)) + return 0; + pte = pte_offset_map(pmd, address); + result = (pte_present(*pte) && pte_write(*pte)); /* 1 only when the pte is both present and writable */ + pte_unmap(pte); + return result; +} + +static inline int address_presents_and_writable_in_pgd(pgd_t* pgd, unsigned long address) /* returns 0 for a none, bad or non-present pgd entry, otherwise descends to the pmd level */ +{ + pmd_t* pmd; + + if (pgd_none(*pgd)) + return 0; + if (pgd_bad(*pgd)) { + pgd_ERROR(*pgd); + return 0; + } + if (!pgd_present(*pgd)) + return 0; + pmd = pmd_offset(pgd, address); + return address_presents_and_writable_in_pmd(pmd, address); +} + +static inline int address_presents_and_writable_in_mm(struct mm_struct* mm, unsigned long address) /* looks up the pgd entry for address in mm and walks down from there */ +{ + pgd_t* pgd; + + pgd = pgd_offset(mm, address); + return address_presents_and_writable_in_pgd(pgd, address); +} + +asmlinkage int address_presents_and_writable(unsigned long address) /* returns 1 if current->mm has a present, writable pte for address, 0 otherwise (including when current has no mm) */ +{ + struct mm_struct* mm; + int result; + + mm = current->mm; + if (!mm) + return 0; + spin_lock(&mm->page_table_lock); /* the whole page-table walk runs under mm->page_table_lock */ + result = address_presents_and_writable_in_mm(mm, address); + spin_unlock(&mm->page_table_lock); + return result; +} +#endif +