-------------------------------------------------------
--! @file integerCombinatoryMultiplier.vhd
--! @brief N bits integer multiplier
--! @author Bruno Albertini (balbertini@usp.br)
--! @date 2017-11-14
-------------------------------------------------------

library ieee;
use ieee.std_logic_1164.all;
use ieee.numeric_std.all;

--! Integer multiplier interface: r = a * b.
--! The entity has only data ports (no clock or reset); the architectures in
--! this file implement it as combinational logic on unsigned operands.
entity integerCombinatoryMultiplier is
  generic (
    --! Width in bits of each operand; the product is 2*n bits wide.
    --! Attention: the wallace and dadda architectures only accept n = 4.
    n : natural := 4
  );
  port (
    --! First operand
    a : in  std_logic_vector(n-1 downto 0);
    --! Second operand
    b : in  std_logic_vector(n-1 downto 0);
    --! Product of a and b
    r : out std_logic_vector(2*n-1 downto 0)
  );
end entity integerCombinatoryMultiplier;

--! Structural 4-bit multiplier using Wallace tree reduction.
--! A Wallace tree compresses the partial products as early as possible: in
--! every stage, each column of equal-weight bits is reduced with the largest
--! adder that fits (a full adder for three bits, a half adder for two).
--! Naming: instances are <ha|fa><stage><column>, with stage "f" for the final
--! carry-propagate row; signals are <instance>s (sum) and <instance>co (carry).
architecture wallace of integerCombinatoryMultiplier is

  --! Full adder: r is the sum bit of a + b + ci, co the carry out
  component fa is
    port (
      a  : in  std_logic;
      b  : in  std_logic;
      ci : in  std_logic;
      r  : out std_logic;
      co : out std_logic
    );
  end component fa;

  --! Half adder: r is the sum bit of a + b, co the carry out
  component ha is
    port (
      a  : in  std_logic;
      b  : in  std_logic;
      r  : out std_logic;
      co : out std_logic
    );
  end component ha;

  --! Partial product matrix: pp(i,j) = a(i) and b(j), which has weight 2**(i+j)
  type ppt is array (3 downto 0, 3 downto 0) of std_logic;
  signal pp : ppt;

  --! Sums and carries produced by the first reduction stage
  signal ha11co                : std_logic;
  signal fa12s, fa12co         : std_logic;
  signal fa13s, fa13co         : std_logic;
  signal fa14s, fa14co         : std_logic;
  signal ha15s, ha15co         : std_logic;
  --! Sums and carries produced by the second reduction stage
  signal ha22co                : std_logic;
  signal fa23s, fa23co         : std_logic;
  signal ha24s, ha24co         : std_logic;
  signal ha25s, ha25co         : std_logic;
  signal ha26s, ha26co         : std_logic;
  --! Carry chain of the final adder row (haf7co is the unused top carry)
  signal haf3co, faf4co        : std_logic;
  signal faf5co, faf6co        : std_logic;
  signal haf7co                : std_logic;

begin

  --! The netlist below is hard-wired for 4-bit operands; reject any other width.
  --! NOTE(review): for n < 4 the indexing a(i)/b(j) below is out of range, so a
  --! tool will likely report that before this assertion is evaluated.
  assert n=4 report "This is a fixed 4 bit implementation." severity failure;

  --! Partial products generation: every bit of a ANDed with every bit of b
  genppi: for ia in 0 to 3 generate
    geppj: for ib in 0 to 3 generate
      pp(ia, ib) <= a(ia) and b(ib);
    end generate geppj;
  end generate genppi;

  --! Column 0 holds a single bit, so it is already a result bit
  r(0) <= pp(0,0);

  --! First stage (reduction) -- one adder per column, columns 1 to 5.
  --! pp(0,3) in column 3 and pp(3,3) in column 6 pass through untouched.
  ha11: ha port map (a => pp(1,0), b => pp(0,1),
                     r => r(1),  co => ha11co);
  fa12: fa port map (a => pp(2,0), b => pp(1,1), ci => pp(0,2),
                     r => fa12s, co => fa12co);
  fa13: fa port map (a => pp(3,0), b => pp(2,1), ci => pp(1,2),
                     r => fa13s, co => fa13co);
  fa14: fa port map (a => pp(3,1), b => pp(2,2), ci => pp(1,3),
                     r => fa14s, co => fa14co);
  ha15: ha port map (a => pp(3,2), b => pp(2,3),
                     r => ha15s, co => ha15co);

  --! Second stage (reduction) -- columns 2 to 6, each combining the carry
  --! coming from the column below with what is left in the column itself.
  ha22: ha port map (a => ha11co, b => fa12s,
                     r => r(2),  co => ha22co);
  fa23: fa port map (a => fa12co, b => fa13s, ci => pp(0,3),
                     r => fa23s, co => fa23co);
  ha24: ha port map (a => fa13co, b => fa14s,
                     r => ha24s, co => ha24co);
  ha25: ha port map (a => fa14co, b => ha15s,
                     r => ha25s, co => ha25co);
  ha26: ha port map (a => ha15co, b => pp(3,3),
                     r => ha26s, co => ha26co);

  --! Final stage (simple sum) -- ripple-carry addition of the two remaining
  --! rows, columns 3 to 7.
  haf3: ha port map (a => ha22co, b => fa23s,
                     r => r(3),  co => haf3co);
  faf4: fa port map (a => fa23co, b => ha24s, ci => haf3co,
                     r => r(4),  co => faf4co);
  faf5: fa port map (a => ha24co, b => ha25s, ci => faf4co,
                     r => r(5),  co => faf5co);
  faf6: fa port map (a => ha25co, b => ha26s, ci => faf5co,
                     r => r(6),  co => faf6co);
  haf7: ha port map (a => ha26co, b => faf6co,
                     r => r(7),  co => haf7co);

end architecture wallace;


--! Structural 4-bit multiplier using Dadda tree reduction.
--! A Dadda tree postpones the reduction and uses as few and as small adders
--! as possible.
--! n = operand size (if the operands differ, the smaller of the two sizes).
--! Start: take the largest i such that di < n, where d1 = 2 and
--!        d(i+1) = floor(1.5 * di); then reduce with i, i-1, ..., 1.
--! Reduction step: for each column of bits, check its height h:
--!   h <= di     : do nothing, copy the bits to the next reduction step
--!   h  = di + 1 : half adder on two bits; sum stays in the column, co goes to the next column
--!   h  > di + 1 : full adder on three bits; sum stays in the column, co goes to the next column
--! Adder outputs are consumed in the next step, but a carry counts towards
--! the height of the column it lands in immediately (within the same step).
--! Naming: instances are <ha|fa><stage><column>, with stage "f" for the final
--! carry-propagate row; signals are <instance>s (sum) and <instance>co (carry).
architecture dadda of integerCombinatoryMultiplier is

  --! Full adder: r is the sum bit of a + b + ci, co the carry out
  component fa is
    port (
      a  : in  std_logic;
      b  : in  std_logic;
      ci : in  std_logic;
      r  : out std_logic;
      co : out std_logic
    );
  end component fa;

  --! Half adder: r is the sum bit of a + b, co the carry out
  component ha is
    port (
      a  : in  std_logic;
      b  : in  std_logic;
      r  : out std_logic;
      co : out std_logic
    );
  end component ha;

  --! Partial product matrix: pp(i,j) = a(i) and b(j), which has weight 2**(i+j)
  type ppt is array (3 downto 0, 3 downto 0) of std_logic;
  signal pp : ppt;

  --! Sums and carries produced by the first reduction stage
  signal ha13s, ha13co         : std_logic;
  signal ha14s, ha14co         : std_logic;
  --! Sums and carries produced by the second reduction stage
  signal ha22s, ha22co         : std_logic;
  signal fa23s, fa23co         : std_logic;
  signal fa24s, fa24co         : std_logic;
  signal fa25s, fa25co         : std_logic;
  --! Carry chain of the final adder row
  signal haf1co, faf2co        : std_logic;
  signal faf3co, faf4co        : std_logic;
  signal faf5co                : std_logic;

begin

  --! The netlist below is hard-wired for 4-bit operands; reject any other width.
  --! NOTE(review): for n < 4 the indexing a(i)/b(j) below is out of range, so a
  --! tool will likely report that before this assertion is evaluated.
  assert n=4 report "This is a fixed 4 bit implementation." severity failure;

  --! Partial products generation: every bit of a ANDed with every bit of b
  genppi: for ia in 0 to 3 generate
    geppj: for ib in 0 to 3 generate
      pp(ia, ib) <= a(ia) and b(ib);
    end generate geppj;
  end generate genppi;

  --! Column 0 holds a single bit, so it is already a result bit
  r(0) <= pp(0,0);

  --! First stage (reduction, i=2, di=3): only columns 3 and 4 exceed three
  --! bits (column 4 because of the carry arriving from column 3).
  ha13: ha port map (a => pp(3,0), b => pp(2,1),
                     r => ha13s, co => ha13co);
  ha14: ha port map (a => pp(3,1), b => pp(2,2),
                     r => ha14s, co => ha14co);

  --! Second stage (reduction, i=1, di=2): bring columns 2 to 5 down to two bits
  ha22: ha port map (a => pp(2,0), b => pp(1,1),
                     r => ha22s, co => ha22co);
  fa23: fa port map (a => ha13s,   b => pp(1,2), ci => pp(0,3),
                     r => fa23s, co => fa23co);
  fa24: fa port map (a => ha14s,   b => pp(1,3), ci => ha13co,
                     r => fa24s, co => fa24co);
  fa25: fa port map (a => pp(3,2), b => pp(2,3), ci => ha14co,
                     r => fa25s, co => fa25co);

  --! Final stage (simple sum): ripple-carry addition of the two remaining
  --! rows, columns 1 to 6; the last carry out is the top result bit.
  haf1: ha port map (a => pp(1,0), b => pp(0,1),
                     r => r(1),  co => haf1co);
  faf2: fa port map (a => ha22s,   b => pp(0,2), ci => haf1co,
                     r => r(2),  co => faf2co);
  faf3: fa port map (a => fa23s,   b => ha22co,  ci => faf2co,
                     r => r(3),  co => faf3co);
  faf4: fa port map (a => fa24s,   b => fa23co,  ci => faf3co,
                     r => r(4),  co => faf4co);
  faf5: fa port map (a => fa25s,   b => fa24co,  ci => faf4co,
                     r => r(5),  co => faf5co);
  faf6: fa port map (a => pp(3,3), b => fa25co,  ci => faf5co,
                     r => r(6),  co => r(7));

end architecture dadda;

--! Functional (behavioural) N-bit multiplier.
--! Both operands are interpreted as unsigned integers and multiplied with the
--! numeric_std "*" operator, whose result is a'length + b'length = 2*n bits
--! wide and therefore matches r exactly.
architecture functional of integerCombinatoryMultiplier is
begin

  --! Recompute the product whenever either operand changes
  multiply: process (a, b)
    variable product : unsigned(2*n-1 downto 0);
  begin
    product := unsigned(a) * unsigned(b);
    r <= std_logic_vector(product);
  end process multiply;

end architecture functional;