MP09

Run Fast

(revised 02NOV2006)


Introduction

In this exercise you will improve the performance of a C function that has been translated into an assembly language function and compare the performance of the implementations. In addition, you will perform comparisons to compiler optimized code.

The Function in C

The folks at Numerical Recipes Software provide a function called ran0 that generates uniformally distributed random numbers. The code is provided below:

#define IA 16807
#define IM 2147483647
#define AM (1.0/IM)
#define IQ 127773
#define IR 2836
#define MASK 123459876

float ran0(long *idum)
{
	long k;
	float ans;

	*idum ^= MASK;
	k=(*idum)/IQ;
	*idum=IA*(*idum-k*IQ)-IR*k;
	if (*idum < 0) *idum += IM;
	ans=AM*(*idum);
	*idum ^= MASK;
	return ans;
}
#undef IA
#undef IM
#undef AM
#undef IQ
#undef IR
#undef MASK

A detailed description of the nature of the generator is provided here.

The Function in ASM

TITLE AsmRan0 Procedure      (AsmRan0.asm)

; An assembly language version of the Numerical recipes rano 
; random number generator

.586
.model flat,C

AsmRan0 PROTO,
	idum:PTR DWORD
	
	; program constants
	IA = 16807
	IM = 2147483647
	IQ = 127773
	IR = 2836
	_MASK = 123459876

.data
	AM REAL8 4.656612875E-10	; (1.0/IM)

.code
;-----------------------------------------------
AsmRan0 PROC C USES edi, idum:PTR DWORD
LOCAL k:DWORD, temp:DWORD, ans:DWORD	
;
; Generate random deviates over the interval (0,1)
;-----------------------------------------------
	push	esi

	; *idum ^= mask
	mov	esi, idum			; get pointer to idum
	mov	eax, [esi]			; get idum
	xor	eax, _MASK			; idum ^ mask
	mov [esi], eax			; idum = idum ^ mask

	; k=(*idum)/IQ;
	mov	eax, [esi]			; get idum
	cdq						; extend the sign	
	mov ecx, IQ				; get IQ
	idiv ecx				; (*idum)/IQ;
	mov k, eax				; k=(*idum)/IQ;

	; *idum=IA*(*idum-k*IQ)-IR*k;
	mov	eax, k				; get k
	imul eax, IQ			; k*IQ
	mov	edx, [esi]			; get idum
	sub	edx, eax			; *idum-k*IQ
	imul edx, IA			; IA*(*idum-k*IQ)
	mov	eax, k				; get k
	imul eax, IR			; IR*k
	sub	edx, eax			; IA*(*idum-k*IQ)-IR*k
	mov [esi], edx			; *idum=IA*(*idum-k*IQ)-IR*k
	
	; if (*idum < 0) *idum += IM;
	mov	eax, [esi]			; get idum
	cmp	eax, 0				; if (*idum < 0)
	jge	Label1
	mov	eax, [esi]			; get idum  
	add	eax, IM				; *idum + IM
	mov [esi], eax			; *idum += IM
Label1:
	
	; ans=AM*(*idum);
	mov	eax, [esi]			; get idum
	mov temp, eax			; save idum for the conversion
	fild temp				; convert idum to real
	fmul AM					; AM*(*idum)
	fstp ans				; ans=AM*(*idum)
	
	; *idum ^= MASK;
	mov	eax, [esi]			; get idum
	xor	eax, _MASK			; idum ^ mask
	mov [esi], eax			; idum = idum ^ mask

	; return ans;
	fld	ans
	
	pop	esi

	ret	0
AsmRan0 ENDP

END

The Comparison Program

// main.cpp - Testing ran0 and AsmRan0

#include <iostream>
#include <time.h>
#include "ran0.h"
#include <Windows.h>
using namespace std;

int main()
{
	// Fill an array with pseudorandom integers.
	const unsigned LOOP_SIZE = 10000000;
	long idum;
	LARGE_INTEGER start, end;
	LARGE_INTEGER freq;

	SetThreadAffinityMask(GetCurrentThread(), 1);

	// make sure installed hardware supports a high-resolution performance counter
	if(!QueryPerformanceFrequency(&freq)) return 1; 


	// Test the C++ function:

	QueryPerformanceCounter(&start);	
	idum = 123456;
	for( int n = 0; n < LOOP_SIZE; n++) ran0(&idum);
	QueryPerformanceCounter(&end);
	
	cout << " c code " << (end.QuadPart - start.QuadPart)/(double) freq.QuadPart 
		<< " seconds" << endl;

	// Test the Assembly language procedure:

	QueryPerformanceCounter(&start);	
	idum = 123456;
	for( int n = 0; n < LOOP_SIZE; n++) AsmRan0(&idum);
	QueryPerformanceCounter(&end);

	cout << " asm code " << (end.QuadPart - start.QuadPart)/(double) freq.QuadPart 
		<< " seconds" << endl;

	return 0;
}

The Header File

// ran0.h

extern "C" {
// C++ version
float ran0(long *idum);

// Assembly language version
float AsmRan0(long *idum);
}

The Task

Your task is:

  1. Verify that the function is working correctly by comparing the output of the .asm and .c functions.
  2. Improve the performance of AsmRan0 by modifying the assembly language implementation given above.
  3. Using a framework simular to the comparison program (given above), compare performance of the .asm and .c functions using the original implementation (the one provided). Select a problem size so that the total run time of the C-callable function is not less than 15 seconds on your computer.
  4. Using a framework simular to the comparison program (given above), compare performance of the .asm and .c functions using your revised (improved) implementation. Use the same problem size as item 3 above.
  5. Using a framework simular to the comparison program (given above), compare performance of the .asm and .c functions using your revised (improved) implementation using a optimized compilation mode. Use the same problem size as item 3 above.
  6. Use the Msinfo32 system call to get the specifics on the processor in your computer.

What You'll Turn In

Using a zip-utility, zip up the following items into a single archive (call it mp08.zip) and submit it to the homework submission system.

  1. The Visual Studio project you created including the mp09.asm code you used.
  2. A comparison of the output from the .asm and .c functions to verify the functions are working correctly.
  3. Performance results for the original comparison.
  4. P{erformance results of your revised implementation.
  5. Performance results for the optimized comparison.
  6. The description from Msinfo32 of your processor.
  7. Items 2,3,4, 5 and 6 should be placed in a file called performance_results.txt.


* based on exercises from Irvine, 5e.