diff -urN linux-2.5.49.orig/CREDITS linux-2.5.49/CREDITS --- linux-2.5.49.orig/CREDITS Sat Nov 23 06:40:52 2002 +++ linux-2.5.49/CREDITS Sun Nov 24 01:47:01 2002 @@ -1963,6 +1963,10 @@ S: Halifax, Nova Scotia S: Canada B3J 3C8 +N: Toshiyuki Maeda +E: tosh@is.s.u-tokyo.ac.jp +D: Kernel Mode Linux + N: Kai Mäkisara E: Kai.Makisara@metla.fi D: SCSI Tape Driver diff -urN linux-2.5.49.orig/Documentation/00-INDEX linux-2.5.49/Documentation/00-INDEX --- linux-2.5.49.orig/Documentation/00-INDEX Sat Nov 23 06:40:23 2002 +++ linux-2.5.49/Documentation/00-INDEX Sun Nov 24 01:47:01 2002 @@ -112,6 +112,8 @@ - listing of various WWW + books that document kernel internals. kernel-parameters.txt - summary listing of command line / boot prompt args for the kernel. +kml.txt + - info on Kernel Mode Linux. kmod.txt - info on the kernel module loader/unloader (kerneld replacement). ldm.txt diff -urN linux-2.5.49.orig/Documentation/kml.txt linux-2.5.49/Documentation/kml.txt --- linux-2.5.49.orig/Documentation/kml.txt Thu Jan 1 09:00:00 1970 +++ linux-2.5.49/Documentation/kml.txt Sun Nov 24 01:47:01 2002 @@ -0,0 +1,95 @@ +Kernel Mode Linux (http://web.yl.is.s.u-tokyo.ac.jp/~tosh/kml) +Toshiyuki Maeda + + +Introduction: + +Kernel Mode Linux is a technology which enables us to execute user programs +in a kernel mode. In Kernel Mode Linux, user programs can be executed as +user processes that have the privilege level of a kernel mode. +The benefit of executing user programs in a kernel mode +is that the user programs can access a kernel address space directly. +So, for example, user programs can invoke +system calls very fast because it is unnecessary to switch between a kernel +mode and a user mode by using costly software interruptions or context switches. +Unlike kernel modules, user programs are executed +as ordinary processes (except for their privilege level), +so scheduling and paging are performed as usual. 
+ +Although it seems dangerous to let user programs access a kernel directly, +safety of the kernel can be ensured, for example, by static type checking, +software fault isolation, and so forth. +For proof of concept, we are developing a system which is based on the combination +of Kernel Mode Linux and Typed Assembly Language, TAL. +(TAL can ensure safety of programs through its type checking and +the type checking can be done at machine binary level. +For more information about TAL, see http://www.cs.cornell.edu/talc) + + +Note: + +Currently, only IA-32 is supported. +Programs executed in a kernel mode shouldn't modify their CS, DS and SS. +If modified, the system will be in an undefined state. + + +Instruction: + +To enable Kernel Mode Linux, say Y in the Kernel Mode Linux field of the +kernel configuration, build and install the kernel, and reboot your machine. +Then, all executables under directory /trusted are executed in a kernel mode +in the current Kernel Mode Linux implementation. For example, to execute a program +named "cat" in a kernel mode, copy the program to directory /trusted +and execute it as follows: + +% /trusted/cat + + +Implementation for IA-32: + +To execute user programs in a kernel mode, Kernel Mode Linux has +a special start_thread (start_kernel_thread) routine, +which is called in execve(2) and sets registers +of a user process to specified initial values. The original start_thread +routine sets the CS segment register to USER_CS. The start_kernel_thread routine +sets the CS register to KERNEL_CS (same as DS, SS, and so on). +Thus, a user program is started as a user process executed in a kernel mode. + +The biggest problem in implementing Kernel Mode Linux is +the stack starvation problem. Let's assume that a user program is executed +in a kernel mode and it causes a page fault on its user stack. 
+To generate a page fault exception, an IA-32 CPU tries to push several +registers (EIP, CS, and so on) to the same user stack because the program +is executed in a kernel mode and the IA-32 CPU doesn't switch its stack +to a kernel stack. Therefore, the IA-32 CPU cannot push the registers +and generates a double fault exception, which fails in the same way. +Finally the IA-32 CPU gives up and resets itself. +This is the stack starvation problem. + +To solve the stack starvation problem, we use the IA-32 hardware task mechanism to +handle exceptions. By using an IA-32 task, the IA-32 CPU doesn't push the registers +to its stack but switches the execution context to a special context. +Therefore, the stack starvation problem doesn't occur. +However, it is costly to handle all exceptions by IA-32 tasks. +So, in the current Kernel Mode Linux implementation, +only a double fault exception is handled by an IA-32 task. + +The other problem is a manual stack switching problem. +In the normal Linux kernel, the IA-32 CPU switches a stack from a user stack +to a kernel stack at exceptions or interruptions. +However, in Kernel Mode Linux, a user program may be executed in a kernel mode +and the IA-32 CPU may not switch a stack. Therefore, +in the current Kernel Mode Linux implementation, the kernel switches a stack +manually at exceptions and interruptions. To switch a stack, +the kernel must know the location of the kernel stack in an address space. +However, at exceptions and interruptions, the kernel cannot use +general registers (EAX, EBX, and so on). Therefore, it is very difficult +to get the location of the kernel stack. + +To solve the above problem, the current Kernel Mode Linux implementation +exploits a per CPU GDT from Ingo Molnar's TLS patch. In Kernel Mode Linux, +one segment descriptor of the per CPU GDT entries directly points to the +location of the pointer to the kernel stack in a TSS. Thus, by using the +segment descriptor, the address of the kernel stack can be obtained with +only one general register. 
+ diff -urN linux-2.5.49.orig/MAINTAINERS linux-2.5.49/MAINTAINERS --- linux-2.5.49.orig/MAINTAINERS Sat Nov 23 06:40:48 2002 +++ linux-2.5.49/MAINTAINERS Sun Nov 24 01:47:01 2002 @@ -977,6 +977,12 @@ W: http://kbuild.sourceforge.net S: Maintained +KERNEL MODE LINUX +P: Toshiyuki Maeda +M: tosh@is.s.u-tokyo.ac.jp +W: http://www.yl.is.s.u-tokyo.ac.jp/~tosh/kml/ +S: Maintained + KERNEL NFSD P: Neil Brown M: neilb@cse.unsw.edu.au diff -urN linux-2.5.49.orig/Makefile linux-2.5.49/Makefile --- linux-2.5.49.orig/Makefile Sat Nov 23 06:40:19 2002 +++ linux-2.5.49/Makefile Sun Nov 24 01:47:22 2002 @@ -1,7 +1,7 @@ VERSION = 2 PATCHLEVEL = 5 SUBLEVEL = 49 -EXTRAVERSION = +EXTRAVERSION = -kml # *DOCUMENTATION* # To see a list of typical targets execute "make help" diff -urN linux-2.5.49.orig/arch/i386/Kconfig linux-2.5.49/arch/i386/Kconfig --- linux-2.5.49.orig/arch/i386/Kconfig Sat Nov 23 06:40:21 2002 +++ linux-2.5.49/arch/i386/Kconfig Sun Nov 24 01:47:01 2002 @@ -1513,6 +1513,29 @@ source "arch/i386/oprofile/Kconfig" +if X86_WP_WORKS_OK + +menu "Kernel Mode Linux" + +config KERNEL_MODE_LINUX + bool "Kernel Mode Linux" + ---help--- + This enables Kernel Mode Linux. In Kernel Mode Linux, user programs + can be executed safely in a kernel mode and access a kernel address space + directly. Thus, for example, costly mode switching between a user and a kernel + can be eliminated. If you say Y here, the kernel enables Kernel Mode Linux. + + More information about Kernel Mode Linux can be found in the + + + If you don't know what to do here, say N. 
+ +comment "Safety check have not been implemented" +depends on KERNEL_MODE_LINUX + +endmenu + +endif menu "Kernel hacking" diff -urN linux-2.5.49.orig/arch/i386/kernel/cpu/common.c linux-2.5.49/arch/i386/kernel/cpu/common.c --- linux-2.5.49.orig/arch/i386/kernel/cpu/common.c Sat Nov 23 06:40:14 2002 +++ linux-2.5.49/arch/i386/kernel/cpu/common.c Sun Nov 24 01:47:01 2002 @@ -440,6 +440,10 @@ int cpu = smp_processor_id(); struct tss_struct * t = init_tss + cpu; struct thread_struct *thread = &current->thread; +#ifdef CONFIG_KERNEL_MODE_LINUX + struct tss_struct* d = init_dft + cpu; + struct dft_stack_struct* ds = dft_stack + cpu; +#endif if (test_and_set_bit(cpu, &cpu_initialized)) { printk(KERN_WARNING "CPU#%d already initialized!\n", cpu); @@ -492,6 +496,18 @@ cpu_gdt_table[cpu][GDT_ENTRY_TSS].b &= 0xfffffdff; load_TR_desc(); load_LDT(&init_mm.context); + +#ifdef CONFIG_KERNEL_MODE_LINUX + set_ksl_desc(cpu, &t->esp0); + __asm__ ("pushl $0x00004002; popl %0\n\t" : "=m" (d->eflags)); + d->esp = (unsigned long)(&(ds->error_code) + 1); + ds->current_tss = d; + ds->previous_tss = t; + set_dft_desc(cpu, d); + + t->ldt = GDT_ENTRY_LDT << 3; + d->ldt = GDT_ENTRY_LDT << 3; +#endif /* Clear %fs and %gs. */ asm volatile ("xorl %eax, %eax; movl %eax, %fs; movl %eax, %gs"); diff -urN linux-2.5.49.orig/arch/i386/kernel/entry.S linux-2.5.49/arch/i386/kernel/entry.S --- linux-2.5.49.orig/arch/i386/kernel/entry.S Sat Nov 23 06:40:24 2002 +++ linux-2.5.49/arch/i386/kernel/entry.S Sun Nov 24 01:47:01 2002 @@ -47,6 +47,7 @@ #include #include #include +#include #include "irq_vectors.h" EBX = 0x00 @@ -61,6 +62,14 @@ ORIG_EAX = 0x24 EIP = 0x28 CS = 0x2C +#ifdef CONFIG_KERNEL_MODE_LINUX +/* + * CS_HW is used as stack switch indicator. + * If CS_HW is non-zero, stack switch occured. + * That is, we were in Kernel-User mode before interruption. 
+ */ +CS_HW = 0x2E +#endif EFLAGS = 0x30 OLDESP = 0x34 OLDSS = 0x38 @@ -94,6 +103,7 @@ movl %edx, %ds; \ movl %edx, %es; +#ifndef CONFIG_KERNEL_MODE_LINUX #define RESTORE_ALL \ popl %ebx; \ popl %ecx; \ @@ -124,6 +134,139 @@ .long 2b,5b; \ .long 3b,6b; \ .previous +#else +#define RESTORE_ALL \ + popl %ebx; \ + popl %ecx; \ + popl %edx; \ + popl %esi; \ + popl %edi; \ + popl %ebp; \ + popl %eax; \ +1: popl %ds; \ +2: popl %es; \ + addl $4,%esp; \ +/* Switch stack KK -> KU. */ \ + /* check whether if stack switch occured or not */ \ + cmpw $0x0, 6(%esp); \ + je 8f; \ + /* clear stack switch record in XCS */ \ + movw $0x0, 6(%esp); \ + pushl %ebp; \ + movl 16(%esp), %ebp; \ + addl $-16, %ebp; \ +3: popl (%ebp); \ +4: popl 4(%ebp); \ +5: popl 8(%ebp); \ +6: popl 12(%ebp); \ + movl %ebp, %esp; \ +7: popl %ebp; \ +8: iret; \ +.section __ex_table,"a";\ + .align 4; \ + .long 1b,3f; \ + .long 2b,4f; \ + .long 3b,5f; \ + .long 4b,5f; \ + .long 5b,5f; \ + .long 6b,5f; \ + .long 7b,5f; \ + .long 8b,5f; \ +.previous; \ +.section .fixup,"ax"; \ +3: movl $0,(%esp); \ + jmp 1b; \ +4: movl $0,(%esp); \ + jmp 2b; \ +5: pushl %ss; \ + popl %ds; \ + pushl %ss; \ + popl %es; \ + pushl $11; \ + call do_exit; \ +.previous +#endif + +#ifndef CONFIG_KERNEL_MODE_LINUX +#define SWITCH_STACK_TO_KK +#define SWITCH_STACK_TO_KK_WITH_ERROR_CODE +#else + +#define TASK_SIZE (__PAGE_OFFSET) +#define __SW_KERNEL_CS (0xffff0000 | __KERNEL_CS) + +/* + * This is a macro for stack switching. + */ +#define SWITCH_STACK_TO_KK \ + /* Check whether if we were in Kernel-User mode or not. */ \ + cmpl $(TASK_SIZE), %esp; \ + /* For anceint processors, clear stack switch in XCS */ \ + /* because they doesn't clear High 16 bits of XCS. */ \ + movw $0x0, 6(%esp); \ + ja 3f; \ + /* \ + * We were in Kernel-User mode, \ + * therefore, XCS == __KERNEL_CS. 
\ + * Thus, we can safely overwrite XCS \ + */ \ + movl %ebp, 4(%esp); /* save %ebp to XCS */ \ + movl %ds, %ebp; \ + cmpw $(__KERNELSTACK_DS), %bp; \ + je 1f; \ + movl $(__KERNELSTACK_DS), %ebp; \ + movl %ebp, %ds; \ +1: \ + movl %esp, %ebp; \ + movl (0x0), %esp; \ + je 2f; \ + pushl $(__KERNEL_DS); \ + popl %ds; \ +2: \ + addl $12, %ebp; \ + addl $-4, %esp; /* XSS */ \ + pushl %ebp; /* ESP */ \ + pushl -4(%ebp); /* EFLAGS */ \ + pushl $(__SW_KERNEL_CS); /* XCS */ \ + pushl -12(%ebp); /* EIP */ \ + movl -8(%ebp), %ebp; /* restore %ebp from XCS */ \ +3: + +/* + * This is as same as the SWITCH_STACK_TO_KK + * but handles an error code on a stack + */ +#define SWITCH_STACK_TO_KK_WITH_ERROR_CODE \ + cmpl $(TASK_SIZE), %esp; \ + movw $0x0, 10(%esp); /* clear stack switch in XCS, sigh... */ \ + ja 3f; \ + /* \ + * We are in Kernel-User mode, \ + * therefore, XCS == __KERNEL_CS. \ + */ \ + movl %ebp, 8(%esp); /* save %ebp to XCS */ \ + movl %ds, %ebp; \ + cmpw $(__KERNELSTACK_DS), %bp; \ + je 1f; \ + movl $(__KERNELSTACK_DS), %ebp; \ + movl %ebp, %ds; \ +1: \ + movl %esp, %ebp; \ + movl (0x0), %esp; \ + je 2f; \ + pushl $(__KERNEL_DS); \ + popl %ds; \ +2: \ + addl $16, %ebp; \ + addl $-4, %esp; /* XSS */ \ + pushl %ebp; /* ESP */ \ + pushl -4(%ebp); /* EFLAGS */ \ + pushl $(__SW_KERNEL_CS); /* XCS */ \ + pushl -12(%ebp); /* EIP */ \ + pushl -16(%ebp); /* error_code */ \ + movl -8(%ebp), %ebp; /* restore %ebp from XCS */ \ + 3: +#endif ENTRY(lcall7) pushfl # We get a different stack layout with call @@ -190,6 +333,10 @@ GET_THREAD_INFO(%ebx) movl EFLAGS(%esp), %eax # mix EFLAGS and CS movb CS(%esp), %al +#ifdef CONFIG_KERNEL_MODE_LINUX + cmpw $0x0, CS_HW(%esp) # return to Kernel-User mode? 
+ jne resume_userspace +#endif testl $(VM_MASK | 3), %eax jz resume_kernel # returning to kernel or vm86-space ENTRY(resume_userspace) @@ -223,6 +370,7 @@ # system call handler stub ALIGN ENTRY(system_call) + SWITCH_STACK_TO_KK pushl %eax # save orig_eax SAVE_ALL GET_THREAD_INFO(%ebx) @@ -322,10 +470,12 @@ ENTRY(irq_entries_start) .rept NR_IRQS ALIGN -1: pushl $vector-256 +0: /* local label "1", "2" and "3" are used in SWITCH_STACK_TO_KK ! */ + SWITCH_STACK_TO_KK + pushl $vector-256 jmp common_interrupt .data - .long 1b + .long 0b .text vector=vector+1 .endr @@ -338,6 +488,7 @@ #define BUILD_INTERRUPT(name, nr) \ ENTRY(name) \ + SWITCH_STACK_TO_KK \ pushl $nr-256; \ SAVE_ALL \ call smp_/**/name; \ @@ -347,6 +498,7 @@ #include "entry_arch.h" ENTRY(divide_error) + SWITCH_STACK_TO_KK pushl $0 # no error code pushl $do_divide_error ALIGN @@ -378,16 +530,19 @@ jmp ret_from_exception ENTRY(coprocessor_error) + SWITCH_STACK_TO_KK pushl $0 pushl $do_coprocessor_error jmp error_code ENTRY(simd_coprocessor_error) + SWITCH_STACK_TO_KK pushl $0 pushl $do_simd_coprocessor_error jmp error_code ENTRY(device_not_available) + SWITCH_STACK_TO_KK pushl $-1 # mark this as an int SAVE_ALL movl %cr0, %eax @@ -403,11 +558,13 @@ jmp ret_from_exception ENTRY(debug) + SWITCH_STACK_TO_KK pushl $0 pushl $do_debug jmp error_code ENTRY(nmi) + SWITCH_STACK_TO_KK pushl %eax SAVE_ALL movl %esp, %edx @@ -418,69 +575,214 @@ RESTORE_ALL ENTRY(int3) + SWITCH_STACK_TO_KK pushl $0 pushl $do_int3 jmp error_code ENTRY(overflow) + SWITCH_STACK_TO_KK pushl $0 pushl $do_overflow jmp error_code ENTRY(bounds) + SWITCH_STACK_TO_KK pushl $0 pushl $do_bounds jmp error_code ENTRY(invalid_op) + SWITCH_STACK_TO_KK pushl $0 pushl $do_invalid_op jmp error_code ENTRY(coprocessor_segment_overrun) + SWITCH_STACK_TO_KK pushl $0 pushl $do_coprocessor_segment_overrun jmp error_code ENTRY(double_fault) + SWITCH_STACK_TO_KK_WITH_ERROR_CODE + pushl $do_double_fault + jmp error_code + +#ifdef CONFIG_KERNEL_MODE_LINUX 
+ENTRY(double_fault_no_stack_switch) pushl $do_double_fault jmp error_code +PAGE_FAULT_ERROR_CODE = 0x2 +TSS_CR3 = 28 +TSS_EIP = 32 +TSS_EFLAGS = 36 +TSS_CS = 76 +TSS_ESP = 56 +TSS_SS = 80 + +/* + * This is a task-handler for double fault. + * In Kernel Mode Linux, user programs may be executed in ring 0 (kernel mode). + * Therefore, normal interruption handling mechanism doesn't work. + * For example, if a page fault occurs in a stack, + * CPU cannot generate a page fault exception because there is no stack + * to save the CPU context. We call this problem "stack starvation". + * To solve the stack starvation, we handle double fault with task-handler. + */ +ENTRY(double_fault_task) + movl 4(%esp), %edi # get current TSS. +/* %edi = current_tss */ + movl 8(%esp), %ebx # get previous TSS. +/* %ebx = prev_tss */ + + # get kernel stack. + cmpw $__KERNEL_CS, TSS_CS(%ebx) + jne 1f + movl TSS_ESP(%ebx), %esi + cmpl $TASK_SIZE, %esi + ja 2f +1: + movl $(__KERNELSTACK_DS), %eax + movl %eax, %ds + movl (0x0), %esi + movl $(__KERNEL_DS), %eax + movl %eax, %ds +2: + movl %esi, %esp +/* From now on, we can use stack. */ + + # recreate stack layout as if normal interruption occurs. + cmpw $__KERNEL_CS, TSS_CS(%ebx) + jne 3f + movl TSS_ESP(%ebx), %esi + cmpl $TASK_SIZE, %esi + ja 4f +3: + pushl TSS_SS(%ebx) + pushl TSS_ESP(%ebx) + + movl TSS_ESP(%ebx), %esi +4: + pushl TSS_EFLAGS(%ebx) + pushl TSS_CS(%ebx) + pushl TSS_EIP(%ebx) + + movw $0x0, 6(%esp) + cmpw $__KERNEL_CS, TSS_CS(%ebx) + jne 5f + cmpl $TASK_SIZE, %esi + ja 5f + /* record stack switch in XCS */ + movw $0xffff, 6(%esp) +5: + + # check whether if stack starvation occured or not. 
+/* %esi = prev_tss->esp */ + # calling address_presents_and_writable + addl $-4, %esi /* %esi = prev_tss->esp - 4 */ + addl $-12, %esp + pushl %esi + call address_presents_and_writable + addl $16, %esp + + testl %eax, %eax + jne 7f +6: + pushl $PAGE_FAULT_ERROR_CODE + movl $page_fault_no_stack_switch, TSS_EIP(%ebx) + andb $253, 37(%ebx) /* == andl $~IF_MASK, TSS_EFLAGS(%ebx) */ + movl %esi, %eax + movl %eax, %cr2 + jmp 9f +7: + addl $-12, %esi /* %esi = prev_tss->esp - 16 */ + addl $-12, %esp + pushl %esi + call address_presents_and_writable + addl $16, %esp + + testl %eax, %eax + jne 8f + jmp 6b +8: + pushl $0 + movl $double_fault_no_stack_switch, TSS_EIP(%ebx) +9: + andb $254, 37(%ebx) /* == andl $~TF_MASK, TSS_EFLAGS(%ebx) */ + movw $__KERNEL_CS, TSS_CS(%ebx) + movl %esp, TSS_ESP(%ebx) + movw $__KERNEL_DS, TSS_SS(%ebx) + + movl TSS_CR3(%edi), %eax + movl %eax, TSS_CR3(%ebx) + + movl TSS_ESP(%edi), %esp + + iret + jmp double_fault_task +#endif + ENTRY(invalid_TSS) + SWITCH_STACK_TO_KK_WITH_ERROR_CODE pushl $do_invalid_TSS jmp error_code ENTRY(segment_not_present) + SWITCH_STACK_TO_KK_WITH_ERROR_CODE pushl $do_segment_not_present jmp error_code ENTRY(stack_segment) + SWITCH_STACK_TO_KK_WITH_ERROR_CODE pushl $do_stack_segment jmp error_code ENTRY(general_protection) + SWITCH_STACK_TO_KK_WITH_ERROR_CODE pushl $do_general_protection jmp error_code ENTRY(alignment_check) + SWITCH_STACK_TO_KK_WITH_ERROR_CODE pushl $do_alignment_check jmp error_code ENTRY(page_fault) + SWITCH_STACK_TO_KK_WITH_ERROR_CODE pushl $do_page_fault jmp error_code +#ifdef CONFIG_KERNEL_MODE_LINUX +ENTRY(page_fault_no_stack_switch) + pushl $do_page_fault + jmp error_code +#endif + #ifdef CONFIG_X86_MCE ENTRY(machine_check) + SWITCH_STACK_TO_KK pushl $0 pushl $do_machine_check jmp error_code #endif ENTRY(spurious_interrupt_bug) + SWITCH_STACK_TO_KK pushl $0 pushl $do_spurious_interrupt_bug jmp error_code + +#ifdef CONFIG_KERNEL_MODE_LINUX +ENTRY(get_kernelstack_address) + pushl %fs + pushl 
$(__KERNELSTACK_DS) + popl %fs + movl %fs:0x0, %eax + popl %fs + ret +#endif .data ENTRY(sys_call_table) diff -urN linux-2.5.49.orig/arch/i386/kernel/head.S linux-2.5.49/arch/i386/kernel/head.S --- linux-2.5.49.orig/arch/i386/kernel/head.S Sat Nov 23 06:40:19 2002 +++ linux-2.5.49/arch/i386/kernel/head.S Sun Nov 24 01:47:01 2002 @@ -441,6 +441,12 @@ .quad 0x00409a0000000000 /* 0xa8 APM CS code */ .quad 0x00009a0000000000 /* 0xb0 APM CS 16 code (16 bit) */ .quad 0x0040920000000000 /* 0xb8 APM DS data */ +#ifdef CONFIG_KERNEL_MODE_LINUX + .quad 0x0000000000000000 /* 0xc0 Kernel Stack Location segment (KSL) set at runtime*/ + .quad 0x0000000000000000 /* 0xc8 Double Fault Task (DFT) set at runtime */ + .quad 0x0000000000000000 /* 0xd0 not used */ + .quad 0x0000000000000000 /* 0xd8 not used */ +#endif #if CONFIG_SMP .fill (NR_CPUS-1)*GDT_ENTRIES,8,0 /* other CPU's GDT */ diff -urN linux-2.5.49.orig/arch/i386/kernel/init_task.c linux-2.5.49/arch/i386/kernel/init_task.c --- linux-2.5.49.orig/arch/i386/kernel/init_task.c Sat Nov 23 06:41:11 2002 +++ linux-2.5.49/arch/i386/kernel/init_task.c Sun Nov 24 01:47:01 2002 @@ -40,3 +40,11 @@ */ struct tss_struct init_tss[NR_CPUS] __cacheline_aligned = { [0 ... NR_CPUS-1] = INIT_TSS }; +#ifdef CONFIG_KERNEL_MODE_LINUX +/* + * We need per cpu TSS of double fault task-handler + * because task-handler cannot be executed cocurrently. + */ +struct tss_struct init_dft[NR_CPUS] __cacheline_aligned = { [0 ... 
NR_CPUS-1] = INIT_DFT }; +struct dft_stack_struct dft_stack[NR_CPUS] __cacheline_aligned; +#endif diff -urN linux-2.5.49.orig/arch/i386/kernel/signal.c linux-2.5.49/arch/i386/kernel/signal.c --- linux-2.5.49.orig/arch/i386/kernel/signal.c Sat Nov 23 06:40:41 2002 +++ linux-2.5.49/arch/i386/kernel/signal.c Sun Nov 24 01:47:01 2002 @@ -159,10 +159,22 @@ err |= __get_user(tmp, &sc->seg); \ regs->x##seg = tmp; } +#ifndef CONFIG_KERNEL_MODE_LINUX #define COPY_SEG_STRICT(seg) \ { unsigned short tmp; \ err |= __get_user(tmp, &sc->seg); \ regs->x##seg = tmp|3; } +#else +#define COPY_SEG_STRICT(seg) \ + { unsigned short tmp; \ + err |= __get_user(tmp, &sc->seg); \ + regs->x##seg = tmp|(regs->x##seg & 3); } + +#define COPY_CS_STRICT \ + { unsigned long tmp; \ + err |= __get_user(tmp, &sc->xcs); \ + regs->xcs = tmp|(regs->xcs & 3); } +#endif #define GET_SEG(seg) \ { unsigned short tmp; \ @@ -181,7 +193,11 @@ COPY(edx); COPY(ecx); COPY(eip); +#ifndef CONFIG_KERNEL_MODE_LINUX COPY_SEG_STRICT(cs); +#else + COPY_CS_STRICT; +#endif COPY_SEG_STRICT(ss); { @@ -302,7 +318,11 @@ err |= __put_user(current->thread.trap_no, &sc->trapno); err |= __put_user(current->thread.error_code, &sc->err); err |= __put_user(regs->eip, &sc->eip); +#ifndef CONFIG_KERNEL_MODE_LINUX err |= __put_user(regs->xcs, (unsigned int *)&sc->cs); +#else + err |= __put_user(regs->xcs, &sc->xcs); +#endif err |= __put_user(regs->eflags, &sc->eflags); err |= __put_user(regs->esp, &sc->esp_at_signal); err |= __put_user(regs->xss, (unsigned int *)&sc->ss); @@ -338,11 +358,20 @@ } /* This is the legacy signal stack switching. 
*/ +#ifndef CONFIG_KERNEL_MODE_LINUX else if ((regs->xss & 0xffff) != __USER_DS && !(ka->sa.sa_flags & SA_RESTORER) && ka->sa.sa_restorer) { esp = (unsigned long) ka->sa.sa_restorer; } +#else + else if ((regs->xss & 0xffff) != __USER_DS && + (regs->esp > TASK_SIZE) && + !(ka->sa.sa_flags & SA_RESTORER) && + ka->sa.sa_restorer) { + esp = (unsigned long) ka->sa.sa_restorer; + } +#endif return (void *)((esp - frame_size) & -8ul); } @@ -397,11 +426,13 @@ regs->esp = (unsigned long) frame; regs->eip = (unsigned long) ka->sa.sa_handler; +#ifndef CONFIG_KERNEL_MODE_LINUX set_fs(USER_DS); regs->xds = __USER_DS; regs->xes = __USER_DS; regs->xss = __USER_DS; regs->xcs = __USER_CS; +#endif regs->eflags &= ~TF_MASK; #if DEBUG_SIG @@ -472,11 +503,13 @@ regs->esp = (unsigned long) frame; regs->eip = (unsigned long) ka->sa.sa_handler; +#ifndef CONFIG_KERNEL_MODE_LINUX set_fs(USER_DS); regs->xds = __USER_DS; regs->xes = __USER_DS; regs->xss = __USER_DS; regs->xcs = __USER_CS; +#endif regs->eflags &= ~TF_MASK; #if DEBUG_SIG @@ -556,8 +589,13 @@ * kernel mode. Just return without doing anything * if so. 
*/ +#ifndef CONFIG_KERNEL_MODE_LINUX if ((regs->xcs & 3) != 3) return 1; +#else + if ((regs->xcs & 3) != 3 && (regs->xcs & 0xffff0000) == 0) + return 1; +#endif if (current->flags & PF_FREEZE) { refrigerator(0); diff -urN linux-2.5.49.orig/arch/i386/kernel/traps.c linux-2.5.49/arch/i386/kernel/traps.c --- linux-2.5.49.orig/arch/i386/kernel/traps.c Sat Nov 23 06:40:22 2002 +++ linux-2.5.49/arch/i386/kernel/traps.c Sun Nov 24 01:47:01 2002 @@ -197,6 +197,18 @@ show_trace(&stack); } +#ifndef CONFIG_KERNEL_MODE_LINUX +static inline int in_user_mode(struct pt_regs* regs) +{ + return (regs->xcs & 3); +} +#else +static inline int in_user_mode(struct pt_regs* regs) +{ + return (regs->xcs & 0xffff0003); +} +#endif + void show_registers(struct pt_regs *regs) { int i; @@ -206,7 +218,7 @@ esp = (unsigned long) (&regs->esp); ss = __KERNEL_DS; - if (regs->xcs & 3) { + if (in_user_mode(regs)) { in_kernel = 0; esp = regs->esp; ss = regs->xss & 0xffff; @@ -304,7 +316,7 @@ static inline void die_if_kernel(const char * str, struct pt_regs * regs, long err) { - if (!(regs->eflags & VM_MASK) && !(3 & regs->xcs)) + if (!(regs->eflags & VM_MASK) && !in_user_mode(regs)) die(str, regs, err); } @@ -323,7 +335,7 @@ if (vm86 && regs->eflags & VM_MASK) goto vm86_trap; - if (!(regs->xcs & 3)) + if (!in_user_mode(regs)) goto kernel_trap; trap_signal: { @@ -421,7 +433,7 @@ if (regs->eflags & VM_MASK) goto gp_in_vm86; - if (!(regs->xcs & 3)) + if (!in_user_mode(regs)) goto gp_in_kernel; current->thread.error_code = error_code; @@ -620,8 +632,8 @@ /* If this is a kernel mode trap, save the user PC on entry to * the kernel, that's what the debugger can make sense of. */ - info.si_addr = ((regs->xcs & 3) == 0) ? (void *)tsk->thread.eip : - (void *)regs->eip; + info.si_addr = (!in_user_mode(regs)) ? (void *)tsk->thread.eip : + (void *)regs->eip; force_sig_info(SIGTRAP, &info, tsk); /* Disable additional traps. 
They'll be re-enabled when @@ -848,6 +860,21 @@ "3" ((char *) (addr)),"2" (__KERNEL_CS << 16)); \ } while (0) +#ifdef CONFIG_KERNEL_MODE_LINUX +#define _set_task_gate(gate_addr,dpl,tss_sel) \ +do { \ + int __d0, __d1; \ + __asm__ __volatile__ ("movw %%dx,%%ax\n\t" \ + "movw %4,%%dx\n\t" \ + "movl %%eax,%0\n\t" \ + "movl %%edx,%1" \ + :"=m" (*((long *) (gate_addr))), \ + "=m" (*(1+(long *) (gate_addr))), "=&a" (__d0), "=&d" (__d1) \ + :"i" ((short) (0x8000+(dpl<<13)+(5<<8))), \ + "3" (0),"2" (tss_sel << 16)); \ +} while (0) + +#endif /* * This needs to use 'idt_table' rather than 'idt', and @@ -875,6 +902,12 @@ _set_gate(a,12,3,addr); } +#ifdef CONFIG_KERNEL_MODE_LINUX +static void __init set_task_gate(unsigned int n, unsigned int tss_sel) +{ + _set_task_gate(idt_table+n,0,tss_sel); +} +#endif #ifdef CONFIG_EISA int EISA_bus; @@ -903,7 +936,11 @@ set_system_gate(5,&bounds); set_trap_gate(6,&invalid_op); set_trap_gate(7,&device_not_available); +#ifndef CONFIG_KERNEL_MODE_LINUX set_trap_gate(8,&double_fault); +#else + set_task_gate(8,(GDT_ENTRY_DFT << 3)); +#endif set_trap_gate(9,&coprocessor_segment_overrun); set_trap_gate(10,&invalid_TSS); set_trap_gate(11,&segment_not_present); diff -urN linux-2.5.49.orig/arch/i386/mm/fault.c linux-2.5.49/arch/i386/mm/fault.c --- linux-2.5.49.orig/arch/i386/mm/fault.c Sat Nov 23 06:40:12 2002 +++ linux-2.5.49/arch/i386/mm/fault.c Sun Nov 24 01:47:01 2002 @@ -139,6 +139,18 @@ asmlinkage void do_invalid_op(struct pt_regs *, unsigned long); +#ifndef CONFIG_KERNEL_MODE_LINUX +static inline int user_mode_access(unsigned long error_code, struct pt_regs* regs) +{ + return (error_code & 4); +} +#else +static inline int user_mode_access(unsigned long error_code, struct pt_regs* regs) +{ + return (error_code & 4) || (regs->xcs & 0xffff0000); +} +#endif + /* * This routine handles page faults. 
It determines the address, * and the problem, and then passes it off to one of the appropriate @@ -182,7 +194,7 @@ * (error_code & 4) == 0, and that the fault was not a * protection error (error_code & 1) == 0. */ - if (address >= TASK_SIZE && !(error_code & 5)) + if (address >= TASK_SIZE && !(error_code & 1) && !user_mode_access(error_code, regs)) goto vmalloc_fault; mm = tsk->mm; @@ -204,7 +216,7 @@ goto good_area; if (!(vma->vm_flags & VM_GROWSDOWN)) goto bad_area; - if (error_code & 4) { + if (user_mode_access(error_code, regs)) { /* * accessing the stack below %esp is always a bug. * The "+ 32" is there due to some instructions (like @@ -226,7 +238,11 @@ switch (error_code & 3) { default: /* 3: write, present */ #ifdef TEST_VERIFY_AREA - if (regs->cs == KERNEL_CS) +#ifndef CONFIG_KERNEL_MODE_LINUX + if (regs->xcs == KERNEL_CS) +#else + if (regs->xcs == KERNEL_CS && !(regs->xcs & 0xffff0000)) +#endif printk("WP fault at %08lx\n", regs->eip); #endif /* fall through */ @@ -282,7 +298,7 @@ up_read(&mm->mmap_sem); /* User mode accesses just cause a SIGSEGV */ - if (error_code & 4) { + if (user_mode_access(error_code, regs)) { tsk->thread.cr2 = address; tsk->thread.error_code = error_code; tsk->thread.trap_no = 14; @@ -364,7 +380,7 @@ goto survive; } printk("VM: killing process %s\n", tsk->comm); - if (error_code & 4) + if (user_mode_access(error_code, regs)) do_exit(SIGKILL); goto no_context; @@ -385,7 +401,7 @@ force_sig_info(SIGBUS, &info, tsk); /* Kernel mode? Handle exceptions or die */ - if (!(error_code & 4)) + if (!user_mode_access(error_code, regs)) goto no_context; return; diff -urN linux-2.5.49.orig/fs/binfmt_elf.c linux-2.5.49/fs/binfmt_elf.c --- linux-2.5.49.orig/fs/binfmt_elf.c Sat Nov 23 06:40:41 2002 +++ linux-2.5.49/fs/binfmt_elf.c Sun Nov 24 01:47:01 2002 @@ -440,6 +440,42 @@ #define INTERPRETER_AOUT 1 #define INTERPRETER_ELF 2 +#ifdef CONFIG_KERNEL_MODE_LINUX +/* + * XXX : we haven't implemented safety check of user programs. 
+ */ +#define TRUSTED_DIR_STR "/trusted/" +#define TRUSTED_DIR_STR_LEN 9 + +static inline int is_safe(struct file* file) +{ + int ret; + char* path; + char* tmp; + struct fs_struct* cur_fs; + + tmp = (char*)__get_free_page(GFP_KERNEL); + + if (!tmp) { + return 0; + } + + path = d_path(file->f_dentry, file->f_vfsmnt, tmp, PAGE_SIZE); + ret = (0 == strncmp(TRUSTED_DIR_STR, path, TRUSTED_DIR_STR_LEN)); + if (ret) { + /* Check whether if we are "chroot"ed */ + /* XXX : I don't know how to check whether if we are chrooted. Is this code correct? */ + cur_fs = current->fs; + read_lock(&cur_fs->lock); + spin_lock(&dcache_lock); + ret = IS_ROOT(cur_fs->root); + spin_unlock(&dcache_lock); + read_unlock(&cur_fs->lock); + } + free_page((unsigned long)tmp); + return ret; +} +#endif static int load_elf_binary(struct linux_binprm * bprm, struct pt_regs * regs) { @@ -780,7 +816,15 @@ ELF_PLAT_INIT(regs); #endif +#ifndef CONFIG_KERNEL_MODE_LINUX start_thread(regs, elf_entry, bprm->p); +#else + if (is_safe(bprm->file)) { + start_kernel_thread(regs, elf_entry, bprm->p); + } else { + start_thread(regs, elf_entry, bprm->p); + } +#endif if (unlikely(current->ptrace & PT_PTRACED)) { if (current->ptrace & PT_TRACE_EXEC) ptrace_notify ((PTRACE_EVENT_EXEC << 8) | SIGTRAP); diff -urN linux-2.5.49.orig/include/asm-i386/desc.h linux-2.5.49/include/asm-i386/desc.h --- linux-2.5.49.orig/include/asm-i386/desc.h Sat Nov 23 06:40:12 2002 +++ linux-2.5.49/include/asm-i386/desc.h Sun Nov 24 01:47:01 2002 @@ -38,6 +38,19 @@ "rorl $16,%%eax" \ : "=m"(*(n)) : "a" (addr), "r"(n), "ir"(limit), "i"(type)) +#ifdef CONFIG_KERNEL_MODE_LINUX +#define _set_codedata_seg_desc(n,addr,type) \ +__asm__ __volatile__ ("movw $0xffff,0(%2)\n\t" \ + "movw %%ax,2(%2)\n\t" \ + "rorl $16,%%eax\n\t" \ + "movb %%al,4(%2)\n\t" \ + "movb %3,5(%2)\n\t" \ + "movb $0xcf,6(%2)\n\t" \ + "movb %%ah,7(%2)\n\t" \ + "rorl $16,%%eax" \ + : "=m"(*(n)) : "a" (addr), "r"(n), "i"(type)) +#endif + static inline void set_tss_desc(unsigned int 
cpu, void *addr) { _set_tssldt_desc(&cpu_gdt_table[cpu][GDT_ENTRY_TSS], (int)addr, 235, 0x89); @@ -47,6 +60,18 @@ { _set_tssldt_desc(&cpu_gdt_table[cpu][GDT_ENTRY_LDT], (int)addr, ((size << 3)-1), 0x82); } + +#ifdef CONFIG_KERNEL_MODE_LINUX +static inline void set_ksl_desc(unsigned int cpu, void* addr) +{ + _set_codedata_seg_desc(&cpu_gdt_table[cpu][GDT_ENTRY_KSL], (int)addr, 0x92); +} + +static inline void set_dft_desc(unsigned int cpu, void* addr) +{ + _set_tssldt_desc(&cpu_gdt_table[cpu][GDT_ENTRY_DFT], (int)addr, 235, 0x89); +} +#endif #define LDT_entry_a(info) \ ((((info)->base_addr & 0x0000ffff) << 16) | ((info)->limit & 0x0ffff)) diff -urN linux-2.5.49.orig/include/asm-i386/mmu_context.h linux-2.5.49/include/asm-i386/mmu_context.h --- linux-2.5.49.orig/include/asm-i386/mmu_context.h Sat Nov 23 06:40:12 2002 +++ linux-2.5.49/include/asm-i386/mmu_context.h Sun Nov 24 01:47:01 2002 @@ -36,7 +36,9 @@ cpu_tlbstate[cpu].active_mm = next; #endif set_bit(cpu, &next->cpu_vm_mask); - +#ifdef CONFIG_KERNEL_MODE_LINUX + init_dft[cpu].__cr3 = __pa(next->pgd); +#endif /* Re-load page tables */ load_cr3(next->pgd); diff -urN linux-2.5.49.orig/include/asm-i386/processor.h linux-2.5.49/include/asm-i386/processor.h --- linux-2.5.49.orig/include/asm-i386/processor.h Sat Nov 23 06:40:14 2002 +++ linux-2.5.49/include/asm-i386/processor.h Sun Nov 24 01:47:01 2002 @@ -78,6 +78,10 @@ extern struct cpuinfo_x86 boot_cpu_data; extern struct tss_struct init_tss[NR_CPUS]; +#ifdef CONFIG_KERNEL_MODE_LINUX +extern struct tss_struct init_dft[NR_CPUS]; +extern struct dft_stack_struct dft_stack[NR_CPUS]; +#endif #ifdef CONFIG_SMP extern struct cpuinfo_x86 cpu_data[]; @@ -385,6 +389,14 @@ unsigned long *ts_io_bitmap; }; +#ifdef CONFIG_KERNEL_MODE_LINUX +struct dft_stack_struct { + unsigned long error_code; + struct tss_struct* current_tss; + struct tss_struct* previous_tss; +}; +#endif + #define INIT_THREAD { \ { { 0, 0 } , }, \ 0, \ @@ -412,6 +424,44 @@ {~0, } /* ioperm */ \ } +#ifdef 
CONFIG_KERNEL_MODE_LINUX +extern void double_fault_task(void); +/* Static initializer for a per-cpu init_dft[] entry: eip is double_fault_task and the segment slots hold kernel selectors. */ +#define INIT_DFT { \ + 0,0, /* back_link, __blh */ \ + 0, /* esp0 */ \ + __KERNEL_DS, 0, /* ss0 */ \ + 0,0,0,0,0,0, /* stack1, stack2 */ \ + 0, /* cr3 */ \ + (unsigned long)double_fault_task, /* eip */ \ + 0, /* eflags */ \ + 0,0,0,0, /* eax,ecx,edx,ebx */ \ + 0, /* esp : lazy initializing */ \ + 0,0,0, /* ebp,esi,edi */ \ + __KERNEL_DS,0, /* es */ \ + __KERNEL_CS,0, /* cs */ \ + __KERNEL_DS,0, /* ss */ \ + __KERNEL_DS,0, /* ds */ \ + __KERNEL_DS,0, /* fs */ \ + __KERNEL_DS,0, /* gs */ \ + GDT_ENTRY_LDT,0, /* ldt */ \ + 0, INVALID_IO_BITMAP_OFFSET, /* trace, bitmap */ \ + {~0, } /* ioperm */ \ +} +#endif /* CONFIG_KERNEL_MODE_LINUX */ + +#ifndef CONFIG_KERNEL_MODE_LINUX +#define start_thread(regs, new_eip, new_esp) do { \ + __asm__("movl %0,%%fs ; movl %0,%%gs": :"r" (0)); \ + set_fs(USER_DS); \ + regs->xds = __USER_DS; \ + regs->xes = __USER_DS; \ + regs->xss = __USER_DS; \ + regs->xcs = __USER_CS; \ + regs->eip = new_eip; \ + regs->esp = new_esp; \ +} while (0) +#else /* CONFIG_KERNEL_MODE_LINUX */ #define start_thread(regs, new_eip, new_esp) do { \ __asm__("movl %0,%%fs ; movl %0,%%gs": :"r" (0)); \ set_fs(USER_DS); \ @@ -421,7 +471,21 @@ regs->xcs = __USER_CS; \ regs->eip = new_eip; \ regs->esp = new_esp; \ + regs->xcs &= 0x0000ffff; \ +} while (0) +/* Same shape as start_thread(), but uses KERNEL_DS and the __KERNEL_* selectors and sets the upper 16 bits of xcs, which the KML variant of user_mode() tests. */ +#define start_kernel_thread(regs, new_eip, new_esp) do { \ + __asm__("movl %0,%%fs ; movl %0,%%gs": :"r" (0)); \ + set_fs(KERNEL_DS); \ + regs->xds = __KERNEL_DS; \ + regs->xes = __KERNEL_DS; \ + regs->xss = __KERNEL_DS; \ + regs->xcs = __KERNEL_CS; \ + regs->eip = new_eip; \ + regs->esp = new_esp; \ + regs->xcs |= 0xffff0000; \ } while (0) +#endif /* CONFIG_KERNEL_MODE_LINUX */ /* Forward declaration, a strange C thing */ struct task_struct; diff -urN linux-2.5.49.orig/include/asm-i386/ptrace.h linux-2.5.49/include/asm-i386/ptrace.h --- linux-2.5.49.orig/include/asm-i386/ptrace.h Sat Nov 23 06:40:18 2002 +++ linux-2.5.49/include/asm-i386/ptrace.h Sun Nov 24 01:47:01 2002 @@ -52,7 +52,11 @@ #define PTRACE_OLDSETOPTIONS 21 #ifdef __KERNEL__
+#ifndef CONFIG_KERNEL_MODE_LINUX #define user_mode(regs) ((VM_MASK & (regs)->eflags) || (3 & (regs)->xcs)) +#else /* CONFIG_KERNEL_MODE_LINUX */ +#define user_mode(regs) ((VM_MASK & (regs)->eflags) || (0xffff0003 & (regs)->xcs)) +#endif /* CONFIG_KERNEL_MODE_LINUX */ #define instruction_pointer(regs) ((regs)->eip) #endif diff -urN linux-2.5.49.orig/include/asm-i386/segment.h linux-2.5.49/include/asm-i386/segment.h --- linux-2.5.49.orig/include/asm-i386/segment.h Sat Nov 23 06:40:24 2002 +++ linux-2.5.49/include/asm-i386/segment.h Sun Nov 24 01:47:01 2002 @@ -35,6 +35,12 @@ * 21 - APM BIOS support * 22 - APM BIOS support * 23 - APM BIOS support +#ifdef CONFIG_KERNEL_MODE_LINUX + * 24 - Kernel Stack Location segment (KSL) + * 25 - Double Fault Handling Task (DFT) + * 26 - not used + * 27 - not used +#endif */ #define GDT_ENTRY_TLS_ENTRIES 3 #define GDT_ENTRY_TLS_MIN 6 @@ -62,10 +68,20 @@ #define GDT_ENTRY_PNPBIOS_BASE (GDT_ENTRY_KERNEL_BASE + 4) #define GDT_ENTRY_APMBIOS_BASE (GDT_ENTRY_KERNEL_BASE + 9) +#ifdef CONFIG_KERNEL_MODE_LINUX +#define GDT_ENTRY_KSL (GDT_ENTRY_KERNEL_BASE + 12) +#define __KERNELSTACK_DS (GDT_ENTRY_KSL * 8) +#define GDT_ENTRY_DFT (GDT_ENTRY_KERNEL_BASE + 13) +#endif /* CONFIG_KERNEL_MODE_LINUX */ + /* * The GDT has 21 entries but we pad it to cacheline boundary: */ +#ifndef CONFIG_KERNEL_MODE_LINUX #define GDT_ENTRIES 24 +#else /* CONFIG_KERNEL_MODE_LINUX */ +#define GDT_ENTRIES 28 +#endif /* CONFIG_KERNEL_MODE_LINUX */ #define GDT_SIZE (GDT_ENTRIES * 8) @@ -75,5 +91,9 @@ * of tasks we can have..
*/ #define IDT_ENTRIES 256 +/* __KERNEL_CS with the upper 16 bits set, i.e. the xcs value that start_kernel_thread() stores. */ +#ifdef CONFIG_KERNEL_MODE_LINUX +#define __SW_KERNEL_CS (0xffff0000 | __KERNEL_CS) +#endif /* CONFIG_KERNEL_MODE_LINUX */ #endif diff -urN linux-2.5.49.orig/include/asm-i386/sigcontext.h linux-2.5.49/include/asm-i386/sigcontext.h --- linux-2.5.49.orig/include/asm-i386/sigcontext.h Sat Nov 23 06:40:24 2002 +++ linux-2.5.49/include/asm-i386/sigcontext.h Sun Nov 24 01:47:01 2002 @@ -70,7 +70,11 @@ unsigned long trapno; unsigned long err; unsigned long eip; +#ifndef CONFIG_KERNEL_MODE_LINUX unsigned short cs, __csh; +#else /* CONFIG_KERNEL_MODE_LINUX */ + unsigned long xcs; +#endif /* CONFIG_KERNEL_MODE_LINUX */ unsigned long eflags; unsigned long esp_at_signal; unsigned short ss, __ssh; diff -urN linux-2.5.49.orig/include/linux/mm.h linux-2.5.49/include/linux/mm.h --- linux-2.5.49.orig/include/linux/mm.h Sat Nov 23 06:40:14 2002 +++ linux-2.5.49/include/linux/mm.h Sun Nov 24 01:47:01 2002 @@ -546,6 +546,10 @@ extern unsigned long get_page_cache_size(void); extern unsigned int nr_used_zone_pages(void); +#ifdef CONFIG_KERNEL_MODE_LINUX +extern asmlinkage int address_presents_and_writable(unsigned long address); +#endif /* CONFIG_KERNEL_MODE_LINUX */ + #endif /* __KERNEL__ */ #endif diff -urN linux-2.5.49.orig/mm/memory.c linux-2.5.49/mm/memory.c --- linux-2.5.49.orig/mm/memory.c Sat Nov 23 06:40:41 2002 +++ linux-2.5.49/mm/memory.c Sun Nov 24 01:47:01 2002 @@ -1351,3 +1351,62 @@ } return page; } +/* Page-table walk helpers: each level returns 0 if its entry is none, bad (after logging it) or not present; the PTE level returns nonzero only if the PTE is present and writable. */ +#ifdef CONFIG_KERNEL_MODE_LINUX +static inline int address_presents_and_writable_in_pmd(pmd_t* pmd, unsigned long address) +{ + pte_t* pte; + int result; + + if (pmd_none(*pmd)) + return 0; + if (pmd_bad(*pmd)) { + pmd_ERROR(*pmd); + return 0; + } + if (!pmd_present(*pmd)) + return 0; + pte = pte_offset_map(pmd, address); + result = (pte_present(*pte) && pte_write(*pte)); + pte_unmap(pte); + return result; +} + +static inline int address_presents_and_writable_in_pgd(pgd_t* pgd, unsigned long address) +{ + pmd_t* pmd; + + if (pgd_none(*pgd)) + return 0; + if (pgd_bad(*pgd)) { + pgd_ERROR(*pgd); + return 0; + } + if (!pgd_present(*pgd)) + return 0; + pmd =
pmd_offset(pgd, address); + return address_presents_and_writable_in_pmd(pmd, address); +} + +static inline int address_presents_and_writable_in_mm(struct mm_struct* mm, unsigned long address) +{ + pgd_t* pgd; + + pgd = pgd_offset(mm, address); + return address_presents_and_writable_in_pgd(pgd, address); +} +/* Returns nonzero if address is mapped present and writable in current->mm, walking the page tables under mm->page_table_lock; returns 0 otherwise, including when current has no mm. */ +asmlinkage int address_presents_and_writable(unsigned long address) +{ + struct mm_struct* mm; + int result; + + mm = current->mm; + if (!mm) + return 0; + spin_lock(&mm->page_table_lock); + result = address_presents_and_writable_in_mm(mm, address); + spin_unlock(&mm->page_table_lock); + return result; +} +#endif /* CONFIG_KERNEL_MODE_LINUX */