# opcontrol --setup --ctr0-event=CPU_CLK_UNHALTED --ctr0-count=600000 --vmlinux=/usr/src/linux-2.4.20/vmlinux
(For RTC mode users, use --rtc-value=2048)
# opcontrol --start
1 | # opcontrol --stop/--shutdown/--dump |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 | /* * Shared data being modified by two threads running on different CPUs. */ /* shared structure between two threads which will be optimized later*/ struct shared_data_align { unsigned int num_proc1; unsigned int num_proc2; }; /* * Shared structure between two threads remains unchanged (non optimized) * This is required in order to collect some samples for the L2_LINES_IN event. */ struct shared_data_nonalign { unsigned int num_proc1; unsigned int num_proc2; }; /* * In the example program below, the parent process creates a clone * thread sharing its memory space. The parent thread running on one CPU * increments the num_proc1 element of the common and common_aln. The cloned * thread running on another CPU increments the value of num_proc2 element of * the common and common_aln structure. */ /* Declare global data */ struct shared_data_nonalign common_aln; /*Declare local shared data */ struct shared_data_align common; /* Now clone a thread sharing memory space with the parent process */ if ((pid = clone(func1, buff+8188, CLONE_VM, &common)) < 0) { perror("clone"); exit(1); } /* Increment the value of num_proc1 in loop */ for (j = 0; j < 200; j++) for(i = 0; i < 100000; i++) { common.num_proc1++; } /* Increment the value of num_proc1 in loop */ for (j = 0; j < 200; j++) for(i = 0; i < 100000; i++) { common_aln.num_proc1++; } /* * The routine below is called by the cloned thread, to increment the num_proc2 * element of common and common_aln structure in loop. */ int func1(struct shared_data_align *com) { int i, j; /* Increment the value of num_proc2 in loop */ for (j = 0; j < 200; j++) for (i = 0; i < 100000; i++) { com->num_proc2++; } /* Increment the value of num_proc2 in loop */ for (j = 0; j < 200; j++) for (i = 0; i < 100000; i++) { common_aln.num_proc2++; } } |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 | # opcontrol --setup --ctr0-event=L2_LINES_IN --ctr0-count=500 --vmlinux=/usr/src/linux-2.4.20/vmlinux #opcontrol --start #./appln #opcontrol --stop #oprofpp -l ./appln Cpu type: PIII Cpu speed was (MHz estimation) : 699.57 Counter 0 counted L2_LINES_IN events (number of allocated lines in L2) with a unit mask of 0x00 (No unit mask) count 500 vma samples % symbol name 080483d0 0 0 _start 080483f4 0 0 call_gmon_start 08048420 0 0 __do_global_dtors_aux 08048480 0 0 fini_dummy 08048490 0 0 frame_dummy 080484c0 0 0 init_dummy 08048630 0 0 __do_global_ctors_aux 08048660 0 0 init_dummy 08048670 0 0 _fini 080484d0 4107 49.2033 main 080485b8 4240 50.7967 func1 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 | #oprofpp -l ./appln Cpu type: PIII Cpu speed was (MHz estimation) : 699.667 Counter 0 counted CPU_CLK_UNHALTED events (clocks processor is not halted) with a unit mask of 0x00 (No unit mask) count 10000 vma samples % symbol name 080483d0 0 0 _start 080483f4 0 0 call_gmon_start 08048420 0 0 __do_global_dtors_aux 08048480 0 0 fini_dummy 08048490 0 0 frame_dummy 080484c0 0 0 init_dummy 08048640 0 0 __do_global_ctors_aux 08048670 0 0 init_dummy 08048680 0 0 _fini 080484d0 40317 49.9356 main 080485bc 40421 50.0644 func1 |
/*
 * Distance, in bytes, to keep between num_proc1 and num_proc2. The value 32
 * is what the original listing implied: a 4-byte unsigned int followed by
 * 28 bytes of padding. Adjust it to the cache line size of the target CPU.
 */
enum { CACHE_LINE_SIZE = 32 };

/*
 * The padding is added to separate the two unsigned ints in such a way
 * that the two elements num_proc1 and num_proc2 are on two different
 * cache lines. Its size is derived from CACHE_LINE_SIZE so that num_proc2
 * always starts exactly CACHE_LINE_SIZE bytes after num_proc1.
 */
struct shared_data_align {
    unsigned int num_proc1;
    char padding[CACHE_LINE_SIZE - sizeof(unsigned int)];
    unsigned int num_proc2;
};

/*
 * This structure remains unchanged, so that some cache lines being read in
 * can still be seen in the profile data.
 */
struct shared_data_nonalign {
    unsigned int num_proc1;
    unsigned int num_proc2;
};
欢迎光临 电子技术论坛_中国专业的电子工程师学习交流社区-中电网技术论坛 (http://bbs.eccn.com/) | Powered by Discuz! 7.0.0 |