Close

Simulation of 8-bit increment

A project log for TRCM

Attempt to reinvent the wheel of HDL

shaos (SHAOS) • 08/17/2018 at 06:16 • 0 Comments

This is the test3.cpp program:

#include <TRCMath.hpp>
#undef DEBUG

#include <time.h>
#include <stdio.h>
#include <stdlib.h>

using namespace std;

using namespace TRC;

class HalfAdder : public Entity
{
 protected:

// indices of the connection points used by step(); this class's own
// constructor leaves them unassigned, so they must be set before step() runs:
  int iA,iB,iS,iC;

// input signals, refreshed from io(iA)/io(iB) at the start of every step:
  Signal A,B;

// output signals (sum and carry), recomputed by every step:
  Signal S,C;

 public:

// s is forwarded unchanged to the Entity base constructor
  HalfAdder(const char* s) : Entity(s)
  {

// nothing to do here for the generic unit

  }

// One simulation step: read both inputs, evaluate the half-adder
// truth table and write sum and carry to their connection points.
  void step()
  {
    A = io(iA).read();
    B = io(iB).read();

    // sum is TRUE only when exactly one input is TRUE and the other is FALSE;
    // every other combination (including values that are neither) gives FALSE
    if((A==TRUE && B==FALSE) || (A==FALSE && B==TRUE))
       S = TRUE;
    else
       S = FALSE;

    // carry is TRUE only when both inputs are TRUE
    C = (A==TRUE && B==TRUE) ? TRUE : FALSE;

//    cout << name() << ":" << A << B << "->" << C << S << endl;

    io(iS) << S;
    io(iC) << C;
  }
};

class World : public Entity
{

// indices of the connection points looked up in the constructor;
// i_input and i_output are each used as a base for 8 consecutive indices:
  int i_increment,i_input,i_output,i_carry;

 public:

// number of step() calls made so far; bits 4..11 supply the input pattern:
  long long counter;

  World() : Entity("World"), counter(0)
  {
    i_increment = at("INC");
    i_input     = at("I",8);
    i_output    = at("O",8);
    i_carry     = at("COUT");
  }

// World works backwards: it reads the circuit outputs first and then
// writes the circuit inputs for the following step.
  void step()
  {
    Wire<9> vec; // temporary vector: vec[0] holds the carry, vec[8] holds output 0

    for(int bit=0;bit<8;bit++)
       vec[8-bit] = io(i_output+bit).read();
    vec[0] = io(i_carry).read();
#ifdef DEBUG
    cout << "Case " << counter << " output=" << vec.binarize() << endl;
#endif

    io(i_increment) << TRUE;

    // input bit k mirrors counter bit k+4, so the input byte changes every 16 steps
    for(int bit=0;bit<8;bit++)
       io(i_input+bit) << ((counter&(1<<(bit+4)))?TRUE:FALSE);

    counter++;
  }

};

// Global byte incremented by the native reference loop in main(); starts at zero.
unsigned char BYTE(0);

int main()
{
  // System instance whose prepare() is called before every simulation step below
  System *sys = System::getInstance();

  // Entity that writes the circuit inputs and reads back its outputs
  World world;

  // Eight half adders chained through the carry points C0..C6:
  // stage k adds input I[k] to the carry of stage k-1 (INC for stage 0)
  // and writes O[k]; the last stage writes its carry to COUT.

  INSTANCE(HalfAdder,0);
      iA = at("INC");
      iB = at("I[0]");
      iS = at("O[0]");
      iC = at("C0");
  NAMED(ha0);

  INSTANCE(HalfAdder,1);
      iA = at("C0");
      iB = at("I[1]");
      iS = at("O[1]");
      iC = at("C1");
  NAMED(ha1);

  INSTANCE(HalfAdder,2);
      iA = at("C1");
      iB = at("I[2]");
      iS = at("O[2]");
      iC = at("C2");
  NAMED(ha2);

  INSTANCE(HalfAdder,3);
      iA = at("C2");
      iB = at("I[3]");
      iS = at("O[3]");
      iC = at("C3");
  NAMED(ha3);

  INSTANCE(HalfAdder,4);
      iA = at("C3");
      iB = at("I[4]");
      iS = at("O[4]");
      iC = at("C4");
  NAMED(ha4);

  INSTANCE(HalfAdder,5);
      iA = at("C4");
      iB = at("I[5]");
      iS = at("O[5]");
      iC = at("C5");
  NAMED(ha5);

  INSTANCE(HalfAdder,6);
      iA = at("C5");
      iB = at("I[6]");
      iS = at("O[6]");
      iC = at("C6");
  NAMED(ha6);

  INSTANCE(HalfAdder,7);
      iA = at("C6");
      iB = at("I[7]");
      iS = at("O[7]");
      iC = at("COUT");
  NAMED(ha7);

  // ---- Reference measurement: native increments of the global BYTE ----
  // Each pass performs 10 increments, i.e. 1e9 increments in total.
  const int nativePasses = 100000000;
  unsigned long started = clock();
  for(int pass=0;pass<nativePasses;pass++)
  {
    BYTE++;
    BYTE++;
    BYTE++;
    BYTE++;
    BYTE++;
    BYTE++;
    BYTE++;
    BYTE++;
    BYTE++;
    BYTE++;
  }
  unsigned long finished = clock();
  double ticks = static_cast<double>(finished-started);

  // ns per increment = seconds * 1e9 / (10 * passes) = seconds / (passes / 1e8)
  printf("BYTE=0x%2.2X (%6.6fs or %2.2fns per increment)\n",BYTE,
         ticks/CLOCKS_PER_SEC,
         ticks/(nativePasses/1e8)/CLOCKS_PER_SEC
        );

  // ---- Simulation measurement: run until World has counted simSteps steps ----
  const int simSteps = 0x100000;
  started = clock();
  while(world.counter!=simSteps)
  {
    sys->prepare();
    // adders are stepped from the lowest stage to the highest, World goes last
    ha0.step();
    ha1.step();
    ha2.step();
    ha3.step();
    ha4.step();
    ha5.step();
    ha6.step();
    ha7.step();
    world.step();
  }
  finished = clock();
  ticks = static_cast<double>(finished-started);

  // ns per step = seconds * 1e9 / steps = seconds / (steps / 1e9)
  printf("%4.4fs or %2.2fns per step\n",
         ticks/CLOCKS_PER_SEC,
         ticks/(simSteps/1e9)/CLOCKS_PER_SEC
        );
}

Output:

INC <- World (idx=0)
I[0] <- World (idx=1)
I[1] <- World (idx=2)
I[2] <- World (idx=3)
I[3] <- World (idx=4)
I[4] <- World (idx=5)
I[5] <- World (idx=6)
I[6] <- World (idx=7)
I[7] <- World (idx=8)
O[0] <- World (idx=9)
O[1] <- World (idx=10)
O[2] <- World (idx=11)
O[3] <- World (idx=12)
O[4] <- World (idx=13)
O[5] <- World (idx=14)
O[6] <- World (idx=15)
O[7] <- World (idx=16)
COUT <- World (idx=17)
INC <- HalfAdder0 (idx=0)
I[0] <- HalfAdder0 (idx=1)
O[0] <- HalfAdder0 (idx=9)
C0 <- HalfAdder0 (idx=18)
C0 <- HalfAdder1 (idx=18)
I[1] <- HalfAdder1 (idx=2)
O[1] <- HalfAdder1 (idx=10)
C1 <- HalfAdder1 (idx=19)
C1 <- HalfAdder2 (idx=19)
I[2] <- HalfAdder2 (idx=3)
O[2] <- HalfAdder2 (idx=11)
C2 <- HalfAdder2 (idx=20)
C2 <- HalfAdder3 (idx=20)
I[3] <- HalfAdder3 (idx=4)
O[3] <- HalfAdder3 (idx=12)
C3 <- HalfAdder3 (idx=21)
C3 <- HalfAdder4 (idx=21)
I[4] <- HalfAdder4 (idx=5)
O[4] <- HalfAdder4 (idx=13)
C4 <- HalfAdder4 (idx=22)
C4 <- HalfAdder5 (idx=22)
I[5] <- HalfAdder5 (idx=6)
O[5] <- HalfAdder5 (idx=14)
C5 <- HalfAdder5 (idx=23)
C5 <- HalfAdder6 (idx=23)
I[6] <- HalfAdder6 (idx=7)
O[6] <- HalfAdder6 (idx=15)
C6 <- HalfAdder6 (idx=24)
C6 <- HalfAdder7 (idx=24)
I[7] <- HalfAdder7 (idx=8)
O[7] <- HalfAdder7 (idx=16)
COUT <- HalfAdder7 (idx=17)
BYTE=0x00 (2.308623s or 2.31ns per increment)
1.7257s or 1645.74ns per step

The program also measures performance, for comparison with a native increment of BYTE (unsigned char), which took 2.31ns per increment on my Linux AMD64 machine. Because in the worst case the propagation delay of the carry may take up to 8 steps of simulation (below is partial DEBUG output showing the switch from 011111111 to 100000000 binary, i.e. 255+1=256, where the 9th bit is the overflow bit):

...
Case 1048561 output=011111111
Case 1048562 output=011111110
Case 1048563 output=011111100
Case 1048564 output=011111000
Case 1048565 output=011110000
Case 1048566 output=011100000
Case 1048567 output=011000000
Case 1048568 output=010000000
Case 1048569 output=100000000
...

we can say that 1 increment takes 8 simulation steps, so 1646ns per step is 13168ns per byte increment, or about 5700 times slower than a native BYTE increment. But if we try the compiler's optimization options we get:

-O1:
0.2054s or 195.87ns per step
-O2:
0.2087s or 199.00ns per step
-O3:
0.1822s or 173.73ns per step

This is a little better - 600 times slower :)

From another point of view, it is about 720 thousand 8-bit increments per second! Fully emulated bit by bit ;)

Or 5.78 million simulation steps per second over 18 connection points (so it's about 100 million connection points per second per core, and it could potentially be easily parallelized to utilize all cores of your PC)...

Discussions