+-----------------------------------------------------------+
|                                                           |
| The March 10, 1986 ELECTRONICS Magazine says a 1024 point |
| FFT Benchmark was performed in 5 milliseconds on the      |
| DSP56000. Perhaps you could elaborate at what clock speed |
| this was performed and other details, such as what the    |
| memory map looks like for that benchmark. The reason I    |
| ask is that in such an application, the internal memory   |
| address space is exceeded and two operands need to be     |
| fetched from external memory port. Is there some way this |
| can be avoided so that the maximum bandwidth of the part  |
| can be used?                                              |
|                                                           |
+-----------------------------------------------------------+

* Note: The Dr. BuB DSP Software library now contains   *
*       several additional FFT benchmarks, including a  *
*       3.39ms 1024 point FFT!                          *

Answer:
       I will answer your question in 3 parts.

a)  f(clk) = 20.5 MHz

b) The code for a Radix 2 Decimation in Time In-Place
   FFT routine is given below:


fft_r2b macro   points,data,coef 
fft_r2b ident   1,1 
; 
; Radix 2 Decimation in Time In-Place Fast Fourier Transform 
; Routine 
; 
;    Complex input and output data 
;        Real data in X memory 
;        Imaginary data in Y memory 
;    Normally ordered input data 
;    Bit reversed output data 
;       Coefficient lookup table 
;        -Cosine values in X memory 
;        -Sine values in Y memory 
; 
; Macro Call - fft_r2b   points,data,coef 
; 
;       points     number of points (4-32768, power of 2) 
;       data       start of data buffer 
;       coef       start of sine/cosine table 
; 
; Structure
;       All passes except the last are performed by three nested
;       DO loops (pass / group / butterfly).  Within one group the
;       twiddle factor is constant, so it is fetched once per group.
;       The last pass has one butterfly per group and therefore a
;       new twiddle factor per butterfly; it is done by a separate
;       single DO loop that fetches the coefficients inside the
;       kernel.
;
; Butterfly (A, B = complex inputs, W = cos - j*sin)
;       A' = A + W*B
;       B' = A - W*B      (formed as 2*A - A' by the SUBL steps)
;       The table stores -cos and -sin, which is why the MAC/MACR
;       operand signs in the kernel look inverted.
;
; Alters Data ALU Registers 
;       x1      x0      y1      y0 
;       a2      a1      a0      a 
;       b2      b1      b0      b 
; 
; Alters Address Registers 
;       r0      n0      m0 
;       r1      n1      m1 
;               n2 
; 
;       r4      n4      m4 
;       r5      n5      m5 
;       r6      n6      m6 
; 
; Alters Program Control Registers 
;       pc      sr 
; 
; Uses 6 locations on System Stack 
;       (three nested DO loops)
; 
; Latest Revision - September 30, 1986 
; Tested and verified - October 2, 1986 
; 
        move    #points/2,n0    ;initialize butterflies per group
        move    #1,n2           ;initialize groups per pass 
        move    #points/4,n6    ;initialize coefficient offset 
        move    #-1,m0          ;initialize address modifiers 
        move    m0,m1           ;for linear addressing 
        move    m0,m4 
        move    m0,m5 
        move    #0,m6           ;initialize coefficient address
                                ;modifier for reverse carry (bit 
                                ;reversed) addressing 
; 
; Do all FFT passes but last pass 
; 
; The loop count is log2(points) with 0.5 subtracted before the
; conversion to integer, i.e. log2(points)-1 passes; the remaining
; pass is handled after _end_pass.
;
        do      #@cvi(@log(points)/@log(2)-0.5),_end_pass 
        move    #data,r0        ;initialize A input pointer 
        move    r0,r4           ;initialize A output pointer 
        lua     (r0)+n0,r1      ;initialize B input pointer 
        move    #coef,r6        ;initialize C input pointer 
        lua     (r1)-,r5        ;initialize B output pointer 
                                ;(one location behind r1, see the
                                ;preload note below)
        move    n0,n1           ;initialize pointer offsets 
        move    n0,n4 
        move    n0,n5 
  
;
; Per-group set-up: fetch this group's twiddle factor and preload
; the registers the pipelined kernel expects on entry.
;   x1 = real part of first B,  y0 = -sine
;   b  = imaginary part of first A
;   a  = current contents of x:(r5); the kernel's first
;        "a,x:(r5)+" store writes this value straight back, so the
;        location is left unchanged and r5 then lines up with the
;        B data.
;   x0 = -cosine; r6 is then advanced by n6 with reverse carry to
;        the next group's coefficient.
;
        do      n2,_end_grp 
        move    x:(r1),x1       y:(r6),y0  ;lookup -sine value 
        move    x:(r5),a        y:(r0),b 
        move    x:(r6)+n6,x0               ;lookup -cosine value 
  
  
;
; Kernel, one butterfly per iteration (Ar/Ai, Br/Bi = real/imaginary
; parts of A and B).  Stores of the B' real part lag one iteration
; behind, which is why a final store follows _end_bfy.
;   1. b = Ai - Br*sin                       load y1 = Bi
;   2. b = b + Bi*cos (rounded) = A' imag    store previous B' real,
;                                            load a = Ai
;   3. a = 2*Ai - A'imag = B' imag           store A' imag,
;                                            load b = Ar
;   4. b = Ar + Br*cos                       store B' imag,
;                                            load a = Ar, advance r0
;   5. b = b + Bi*sin (rounded) = A' real    load x1 = next Br
;   6. a = 2*Ar - A'real = B' real           store A' real,
;                                            load b = next Ai
;
        do      n0,_end_bfy     ;Radix 2 DIT butterfly kernel
                                ;with constant twiddle factor  
        mac     x1,y0,b                         y:(r1)+,y1      
        macr    -x0,y1,b        a,x:(r5)+       y:(r0),a        
        subl    b,a             x:(r0),b        b,y:(r4) 
        mac     -x1,x0,b        x:(r0)+,a       a,y:(r5) 
        macr    -y1,y0,b        x:(r1),x1 
        subl    b,a             b,x:(r4)+       y:(r0),b 
_end_bfy 

;
; Flush the last B' real part still held in a, then step all four
; data pointers over the other half of the group just processed so
; they address the next group.
;
        move    a,x:(r5)+n5     y:(r1)+n1,y1  ;update pntrs r1,
        move    x:(r0)+n0,x1    y:(r4)+n4,y1  ;r0 and r4 by dummy
                                              ;moves in x1 and y1
_end_grp 
;
; Prepare the next pass: halve n0 and double n2.  The values are
; routed through b1/a1 so the shifts can be done in the data ALU;
; the parallel moves load a1 and write back n0 alongside the shifts.
;
        move    n0,b1 
        lsr     b       n2,a1  ;divide butterflies per group by 2
        lsl     a       b1,n0  ;multiply groups per pass by two 
        move            a1,n2 
_end_pass 
; 
; Do last FFT pass 
; 
; n0 was halved at the end of the pass loop, while n1, n4 and n5
; still hold the offset loaded in its final iteration; copying n1
; back into n0 makes all four data pointers step by the same
; amount.  A and B are adjacent here (r1 = r0+1), and r5 starts
; one step of n1 behind r1 so that the kernel's first
; "a,x:(r5)+n5" store rewrites a preloaded value unchanged, as in
; the earlier passes.
;
        move    n1,n0           ;correct pointer offset for last
                                ;pass
        move    #data,r0        ;initialize A input pointer 
        move    r0,r4           ;initialize A output pointer 
        lua     (r0)+,r1        ;initialize B input pointer 
        move    #coef,r6        ;initialize C input pointer 
        lua     (r1)-n1,r5      ;initialize B output pointer 
        move    x:(r1),x1       y:(r6),y0 
        move    x:(r5),a        y:(r0),b 
  
;
; Same arithmetic as the kernel above, but every pointer update
; uses its offset register and the twiddle factor is fetched per
; butterfly: -cosine in the first instruction (advancing r6 by n6
; with reverse carry) and the next -sine in the fifth.
;
        do      n2,_lastpass    ;Radix 2 DIT butterfly kernel 
                                ;with one butterfly per group 

        mac     x1,y0,b         x:(r6)+n6,x0    y:(r1)+n1,y1
        macr    -x0,y1,b        a,x:(r5)+n5     y:(r0),a        
        subl    b,a             x:(r0),b        b,y:(r4) 
        mac     -x1,x0,b        x:(r0)+n0,a     a,y:(r5) 
        macr    -y1,y0,b        x:(r1),x1       y:(r6),y0 
        subl    b,a             b,x:(r4)+n4     y:(r0),b 
_lastpass 
        move    a,x:(r5)+n5     ;store the final B' real part
        endm 



The memory map for the Radix 2 Decimation in Time In-Place FFT
routine, fft_r2b, listed above is given below.


#coef. +                         
#points -->|-----------|                   |-----------|
           | Half-Wave |                   | Half-Wave |
           | Negative  |<--(R6)+N6         | Negative  |
           | Cosine    |                   | Sine      |         
           | Table     |                   | Table     |
 #coef. -->|-----------|<--(R6)  #coef. -->|-----------|<-(R6)
#data +    |           |                   |           |
#points -->|-----------|                   |-----------|      
           |  Real     |                   |           |
           |  Data     |<--(R1)            | Imaginary |
           |           |<--(R5)            |   Data    |<-(R5)
           |           |                   |           |<-(R4)
 #data --->|___________|<--(R0)   #data -->|___________|<-(R0)
                                          
             X DATA                                    Y DATA
                    
                                              
where 
      #points = number of points ( 4-32768,power of 2 )
      #data   = starting address of data buffer
      #coef   = starting address of sine/cosine tables


Note: The starting locations of the sine and cosine tables must
be multiples of #points/2, i.e. for the 1024-point FFT, #coef
can be 0, 512, 1024, 1536, etc., since bit-reversed (reverse-
carry) addressing is used to access these tables.
To make the bit-reversed output data easy to access, the
starting location of the output, #data, should be a multiple of
#points, i.e. for the 1024-point FFT, #data can be 0, 1024,
2048, etc.



The fft_r2b macro is not the fastest routine available; it is,
however, simple and requires less program memory than other
slightly faster routines that are currently available. For the
1024-point FFT case above, where internal program memory is used
and #data=0, the time required is 5.03 ms.



c) For more information please refer to the following paper:
    K. Kloker, "The Motorola DSP56000 Digital Signal Processor,"
    IEEE MICRO, December 1986, pp. 29-48.
