diff --git a/xdma/axi_bram.sv b/xdma/ch_2/axi_bram.sv similarity index 97% rename from xdma/axi_bram.sv rename to xdma/ch_2/axi_bram.sv index 200cbfc..f18536f 100644 --- a/xdma/axi_bram.sv +++ b/xdma/ch_2/axi_bram.sv @@ -61,10 +61,10 @@ reg [AXI_IDWIDTH-1:0] rid = '0; // 从 读地址 传过来的id reg [ 7:0] rcount = '0; // 从 读地址 传过来的读取次数 reg [ MEM_AWIDTH-1:0] mem_raddr, mem_raddr_last; // 当前操作的地址, 和上次操作的地址 -assign s_axi_arready = (rstate == R_IDLE); // 读地址 可以进行 接收 -assign s_axi_rvalid = (rstate == R_BUSY); // 读数据 以准备就绪 +assign s_axi_arready = (rstate == R_IDLE); // 空闲 读地址 可以进行 接收 +assign s_axi_rvalid = (rstate == R_BUSY); // 正在传送数据 读数据 以准备就绪 assign s_axi_rlast = (rstate == R_BUSY) && (rcount == 8'd0); // 是否是最后一个数据, 直接连线 -assign s_axi_rid = rid; // 直接连线 +assign s_axi_rid = rid; // 直接连线, 反馈给 M assign s_axi_rresp = '0; always @ (posedge clk or negedge rstn) diff --git a/xdma/fpga_top.sv b/xdma/ch_2/fpga_top.sv similarity index 100% rename from xdma/fpga_top.sv rename to xdma/ch_2/fpga_top.sv diff --git a/xdma/xdma_rw.c b/xdma/ch_2/xdma_rw.c similarity index 100% rename from xdma/xdma_rw.c rename to xdma/ch_2/xdma_rw.c diff --git a/xdma/ch_3/axi_mpeg2encoder_wrapper.sv b/xdma/ch_3/axi_mpeg2encoder_wrapper.sv new file mode 100644 index 0000000..0f0c44f --- /dev/null +++ b/xdma/ch_3/axi_mpeg2encoder_wrapper.sv @@ -0,0 +1,287 @@ + +module axi_mpeg2encoder_wrapper #( + parameter AXI_IDWIDTH = 4 +) ( + input wire rstn, + input wire clk, + // AXI-MM AW interface ---------------------------------------------------- + output wire s_axi_awready, + input wire s_axi_awvalid, + input wire [ 63:0] s_axi_awaddr, + input wire [ 7:0] s_axi_awlen, + input wire [AXI_IDWIDTH-1:0] s_axi_awid, + // AXI-MM W interface ---------------------------------------------------- + output wire s_axi_wready, + input wire s_axi_wvalid, + input wire s_axi_wlast, + input wire [ 63:0] s_axi_wdata, + // AXI-MM B interface ---------------------------------------------------- + input wire s_axi_bready, + output wire s_axi_bvalid, + output wire [AXI_IDWIDTH-1:0] s_axi_bid, + output wire [ 1:0] s_axi_bresp, + // AXI-MM AR interface ---------------------------------------------------- + output wire s_axi_arready, + input wire s_axi_arvalid, + input wire [ 63:0] s_axi_araddr, + input wire [ 7:0] s_axi_arlen, + input wire [AXI_IDWIDTH-1:0] s_axi_arid, + // AXI-MM R interface ---------------------------------------------------- + input wire s_axi_rready, + output wire s_axi_rvalid, + output wire s_axi_rlast, + output reg [ 63:0] s_axi_rdata, + output wire [AXI_IDWIDTH-1:0] s_axi_rid, + output wire [ 1:0] s_axi_rresp +); + + + + +// --------------------------------------------------------------------------------------- +// AXI READ state machine +// --------------------------------------------------------------------------------------- + +enum reg [0:0] {R_IDLE, R_BUSY} rstate = R_IDLE; + +reg [AXI_IDWIDTH-1:0] rid = '0; +reg [ 7:0] rcount = '0; +reg [ 63-3:0] raddr_63_3, raddr_63_3_r; +wire [ 63:0] raddr = {raddr_63_3 , 3'h0}; +wire [ 63:0] raddr_r = {raddr_63_3_r, 3'h0}; + +assign s_axi_arready = (rstate == R_IDLE); +assign s_axi_rvalid = (rstate == R_BUSY); +assign s_axi_rlast = (rstate == R_BUSY) && (rcount == 8'd0); +assign s_axi_rid = rid; +assign s_axi_rresp = '0; + +always @ (posedge clk or negedge rstn) + if (~rstn) begin + rstate <= R_IDLE; + rid <= '0; + rcount <= '0; + end else begin + case (rstate) + R_IDLE : + if (s_axi_arvalid) begin + rstate <= R_BUSY; + rid <= s_axi_arid; + rcount <= s_axi_arlen; + end + R_BUSY : + if (s_axi_rready) begin + if (rcount == 8'd0) // the last data of read session + rstate <= R_IDLE; + rcount <= rcount - 8'd1; + end + endcase + end + +always_comb + if (rstate == R_IDLE && s_axi_arvalid) + raddr_63_3 = s_axi_araddr[63:3]; + else if (rstate == R_BUSY && s_axi_rready) + raddr_63_3 = raddr_63_3_r + 61'h1; + else + raddr_63_3 = raddr_63_3_r; + +always @ (posedge clk) + raddr_63_3_r <= raddr_63_3; + + + +// --------------------------------------------------------------------------------------- +// AXI WRITE state machine +// --------------------------------------------------------------------------------------- + +enum reg [1:0] {W_IDLE, W_BUSY, W_RESP} wstate = W_IDLE; + +reg [AXI_IDWIDTH-1:0] wid = '0; +reg [ 7:0] wcount = '0; +reg [ 63-3:0] waddr_63_3 = '0; +wire [ 63:0] waddr = {waddr_63_3, 3'h0}; + +assign s_axi_awready = (wstate == W_IDLE); +assign s_axi_wready = (wstate == W_BUSY); +assign s_axi_bvalid = (wstate == W_RESP); +assign s_axi_bid = wid; +assign s_axi_bresp = '0; + +always @ (posedge clk or negedge rstn) + if (~rstn) begin + wstate <= W_IDLE; + wid <= '0; + wcount <= '0; + waddr_63_3 <= '0; + end else begin + case (wstate) + W_IDLE : + if (s_axi_awvalid) begin + wstate <= W_BUSY; + wid <= s_axi_awid; + wcount <= s_axi_awlen; + waddr_63_3 <= s_axi_awaddr[63:3]; + end + W_BUSY : + if (s_axi_wvalid) begin + if (wcount == 8'd0 || s_axi_wlast) + wstate <= W_RESP; + wcount <= wcount - 8'd1; + waddr_63_3 <= waddr_63_3 + 61'h1; + end + W_RESP : + if (s_axi_bready) + wstate <= W_IDLE; + default : + wstate <= W_IDLE; + endcase + end + + + +function automatic logic [255:0] big_endian_to_little_endian_32B (input logic [255:0] din); + logic [255:0] dout; + for (int i=0; i<32; i++) + dout[i*8 +: 8] = din[(31-i)*8 +: 8]; + return dout; +endfunction + + + +////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +// signals of the MPEG2 encoder +////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +reg mpeg2_rstn = '0; + +wire mpeg2_sequence_busy; +reg mpeg2_sequence_stop = '0; + +reg [ 6:0] mpeg2_xsize16 = '0; +reg [ 6:0] mpeg2_ysize16 = '0; + +reg mpeg2_i_en = '0; +reg [ 7:0] mpeg2_i_Y0, mpeg2_i_Y1, mpeg2_i_Y2, mpeg2_i_Y3; +reg [ 7:0] mpeg2_i_U0, mpeg2_i_U2 ; +reg [ 7:0] mpeg2_i_V0, mpeg2_i_V2 ; + +wire mpeg2_o_en; +wire mpeg2_o_last; +wire [255:0] mpeg2_o_data; + +reg [ 15:0] mpeg2_o_addr = '0; +reg mpeg2_o_over = '0; // 1: overflow ! + + +reg [255:0] out_buf ['h10000]; // out buffer : a block RAM to save the MPEG2 IP's out data, 32B * 0x10000 = 2 MB + +reg [255:0] out_buf_rdata; + +always @ (posedge clk) + out_buf_rdata <= out_buf[ (16)'(raddr>>5) ]; // read BRAM + +always @ (posedge clk) + if ( mpeg2_o_en & ~mpeg2_o_over ) + out_buf[mpeg2_o_addr] <= big_endian_to_little_endian_32B(mpeg2_o_data); + + +always_comb + if ( raddr_r == 64'h00000000 ) // address = 0x00000000 : read status register (reset status and sequence status) + s_axi_rdata = {61'h0, mpeg2_sequence_busy, 1'b0, mpeg2_rstn}; + else if ( raddr_r == 64'h00000008 ) // address = 0x00000008 : read status register (video frame size) + s_axi_rdata = {25'h0, mpeg2_ysize16, + 25'h0, mpeg2_xsize16 }; + else if ( raddr_r == 64'h00000010 ) // address = 0x00000010 : read status register (out buffer status) + s_axi_rdata = { 31'h0, mpeg2_o_over, + 11'h0, mpeg2_o_addr, 5'h0}; + else if ( raddr_r >= 64'h01000000 ) // address >= 0x01000000 : read out buffer (i.e. MPEG2 output stream) + case( raddr_r[4:3] ) + 2'b00 : s_axi_rdata = out_buf_rdata[ 63: 0]; + 2'b01 : s_axi_rdata = out_buf_rdata[127: 64]; + 2'b10 : s_axi_rdata = out_buf_rdata[191:128]; + 2'b11 : s_axi_rdata = out_buf_rdata[255:192]; + endcase + else + s_axi_rdata = '0; + + +always @ (posedge clk) begin + mpeg2_sequence_stop <= 1'b0; + mpeg2_i_en <= 1'b0; + + if ( mpeg2_o_en ) begin + if ( mpeg2_o_addr == '1 ) + mpeg2_o_over <= 1'b1; + else + mpeg2_o_addr <= mpeg2_o_addr + 16'h1; + end + + if ( s_axi_wvalid & s_axi_wready ) begin + if ( waddr == 64'h00000000 ) begin // address = 0x00000000 : write reset control register (reset control and sequence control) + mpeg2_rstn <= s_axi_wdata[0]; + mpeg2_sequence_stop <= s_axi_wdata[1]; + + end else if ( waddr == 64'h00000008 ) begin // address = 0x00000008 : write control register (video frame size) + mpeg2_xsize16 <= s_axi_wdata[ 6: 0]; + mpeg2_ysize16 <= s_axi_wdata[38:32]; + + end else if ( waddr == 64'h00000010 ) begin // address = 0x00000010 : write control register (out buffer control) + mpeg2_o_addr <= '0; + mpeg2_o_over <= '0; + + end else if ( waddr >= 64'h01000000 ) begin // address >= 0x01000000 : write video input data (i.e. raw YUV pixels) + mpeg2_i_en <= 1'b1; + { mpeg2_i_V2, mpeg2_i_Y3, mpeg2_i_U2, mpeg2_i_Y2, + mpeg2_i_V0, mpeg2_i_Y1, mpeg2_i_U0, mpeg2_i_Y0 } <= s_axi_wdata; + end + end +end + + + +////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +// MPEG2 encoder instance +////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +mpeg2encoder #( + .XL ( 6 ), // determine the max horizontal pixel count. 4->256 pixels 5->512 pixels 6->1024 pixels 7->2048 pixels . + .YL ( 6 ), // determine the max vertical pixel count. 4->256 pixels 5->512 pixels 6->1024 pixels 7->2048 pixels . + .VECTOR_LEVEL ( 3 ), + .Q_LEVEL ( 2 ) +) mpeg2encoder_i ( + .rstn ( mpeg2_rstn ), + .clk ( clk ), + // Video sequence configuration interface. + .i_xsize16 ( mpeg2_xsize16 ), + .i_ysize16 ( mpeg2_ysize16 ), + .i_pframes_count ( 8'd47 ), + // Video sequence input pixel stream interface. In each clock cycle, this interface can input 4 adjacent pixels in a row. Pixel format is YUV 4:4:4, the module will convert it to YUV 4:2:0, then compress it to MPEG2 stream. + .i_en ( mpeg2_i_en ), + .i_Y0 ( mpeg2_i_Y0 ), + .i_Y1 ( mpeg2_i_Y1 ), + .i_Y2 ( mpeg2_i_Y2 ), + .i_Y3 ( mpeg2_i_Y3 ), + .i_U0 ( mpeg2_i_U0 ), + .i_U1 ( mpeg2_i_U0 ), + .i_U2 ( mpeg2_i_U2 ), + .i_U3 ( mpeg2_i_U2 ), + .i_V0 ( mpeg2_i_V0 ), + .i_V1 ( mpeg2_i_V0 ), + .i_V2 ( mpeg2_i_V2 ), + .i_V3 ( mpeg2_i_V2 ), + // Video sequence control interface. + .i_sequence_stop ( mpeg2_sequence_stop ), + .o_sequence_busy ( mpeg2_sequence_busy ), + // Video sequence output MPEG2 stream interface. + .o_en ( mpeg2_o_en ), + .o_last ( mpeg2_o_last ), + .o_data ( mpeg2_o_data ) +); + + + + +endmodule + + diff --git a/xdma/ch_3/fpga_top.sv b/xdma/ch_3/fpga_top.sv new file mode 100644 index 0000000..4e161a4 --- /dev/null +++ b/xdma/ch_3/fpga_top.sv @@ -0,0 +1,214 @@ + +module fpga_top ( + output wire o_led0, + input wire i_pcie_rstn, + input wire i_pcie_refclkp, i_pcie_refclkn, + input wire [0:0] i_pcie_rxp, i_pcie_rxn, + output wire [0:0] o_pcie_txp, o_pcie_txn +); + + +localparam AXI_IDWIDTH = 4; +localparam AXI_AWIDTH = 64; +localparam AXI_DWIDTH = 64; + + +wire pcie_rstn; +wire pcie_refclk; + +wire rstn; +wire clk; + + + +/////////////////////////////////////////////////////////////////////////////////////////////////////////////// +// PCIe XDMA's AXI interface +/////////////////////////////////////////////////////////////////////////////////////////////////////////////// +wire axi_awready; +wire axi_awvalid; +wire [ AXI_AWIDTH-1:0] axi_awaddr; +wire [ 7:0] axi_awlen; +wire [ AXI_IDWIDTH-1:0] axi_awid; +// AXI Master Write Data Channel +wire axi_wready; +wire axi_wvalid; +wire axi_wlast; +wire [ AXI_DWIDTH-1:0] axi_wdata; +// AXI Master Write Response Channel +wire axi_bready; +wire axi_bvalid; +wire [ AXI_IDWIDTH-1:0] axi_bid; +wire [ 1:0] axi_bresp; +// AXI Master Read Address Channel +wire axi_arready; +wire axi_arvalid; +wire [ AXI_AWIDTH-1:0] axi_araddr; +wire [ 7:0] axi_arlen; +wire [ AXI_IDWIDTH-1:0] axi_arid; +// AXI Master Read Data Channel +wire axi_rready; +wire axi_rvalid; +wire axi_rlast; +wire [ AXI_DWIDTH-1:0] axi_rdata; +wire [ AXI_IDWIDTH-1:0] axi_rid; +wire [ 1:0] axi_rresp; + + + +/////////////////////////////////////////////////////////////////////////////////////////////////////////////// +// Ref clock input buffer +/////////////////////////////////////////////////////////////////////////////////////////////////////////////// +IBUFDS_GTE2 refclk_ibuf ( + .CEB ( 1'b0 ), + .I ( i_pcie_refclkp ), + .IB ( i_pcie_refclkn ), + .O ( pcie_refclk ), + .ODIV2 ( ) +); + + + +/////////////////////////////////////////////////////////////////////////////////////////////////////////////// +// Reset input buffer +/////////////////////////////////////////////////////////////////////////////////////////////////////////////// +IBUF sys_reset_n_ibuf ( + .I ( i_pcie_rstn ), + .O ( pcie_rstn ) +); + + + +/////////////////////////////////////////////////////////////////////////////////////////////////////////////// +// PCIe XDMA core +/////////////////////////////////////////////////////////////////////////////////////////////////////////////// +xdma_0 xdma_0_i ( + // PCI Express (PCIe) Interface : connect to the pins of FPGA chip + .sys_rst_n ( pcie_rstn ), + .sys_clk ( pcie_refclk ), + .pci_exp_txn ( o_pcie_txn ), + .pci_exp_txp ( o_pcie_txp ), + .pci_exp_rxn ( i_pcie_rxn ), + .pci_exp_rxp ( i_pcie_rxp ), + // PCIe link up + .user_lnk_up ( o_led0 ), + // interrupts + .usr_irq_req ( 16'h0 ), + .usr_irq_ack ( ), + // + .msix_enable ( ), + // clock/reset for user (for AXI) + .axi_aclk ( clk ), + .axi_aresetn ( rstn ), + // AXI Interface + .m_axi_awready ( axi_awready ), + .m_axi_awvalid ( axi_awvalid ), + .m_axi_awaddr ( axi_awaddr ), + .m_axi_awlen ( axi_awlen ), + .m_axi_awid ( axi_awid ), + .m_axi_awsize ( ), + .m_axi_awburst ( ), + .m_axi_awprot ( ), + .m_axi_awlock ( ), + .m_axi_awcache ( ), + .m_axi_wready ( axi_wready ), + .m_axi_wvalid ( axi_wvalid ), + .m_axi_wlast ( axi_wlast ), + .m_axi_wdata ( axi_wdata ), + .m_axi_wstrb ( ), + .m_axi_bready ( axi_bready ), + .m_axi_bvalid ( axi_bvalid ), + .m_axi_bid ( axi_bid ), + .m_axi_bresp ( axi_bresp ), + .m_axi_arready ( axi_arready ), + .m_axi_arvalid ( axi_arvalid ), + .m_axi_araddr ( axi_araddr ), + .m_axi_arlen ( axi_arlen ), + .m_axi_arid ( axi_arid ), + .m_axi_arsize ( ), + .m_axi_arburst ( ), + .m_axi_arprot ( ), + .m_axi_arlock ( ), + .m_axi_arcache ( ), + .m_axi_rready ( axi_rready ), + .m_axi_rvalid ( axi_rvalid ), + .m_axi_rlast ( axi_rlast ), + .m_axi_rdata ( axi_rdata ), + .m_axi_rid ( axi_rid ), + .m_axi_rresp ( axi_rresp ), + // AXI bypass interface + .m_axib_awready ( '0 ), + .m_axib_awvalid ( ), + .m_axib_awaddr ( ), + .m_axib_awlen ( ), + .m_axib_awid ( ), + .m_axib_awsize ( ), + .m_axib_awburst ( ), + .m_axib_awprot ( ), + .m_axib_awlock ( ), + .m_axib_awcache ( ), + .m_axib_wready ( '0 ), + .m_axib_wvalid ( ), + .m_axib_wlast ( ), + .m_axib_wdata ( ), + .m_axib_wstrb ( ), + .m_axib_bready ( ), + .m_axib_bvalid ( '0 ), + .m_axib_bid ( '0 ), + .m_axib_bresp ( '0 ), + .m_axib_arready ( '0 ), + .m_axib_arvalid ( ), + .m_axib_araddr ( ), + .m_axib_arlen ( ), + .m_axib_arid ( ), + .m_axib_arsize ( ), + .m_axib_arburst ( ), + .m_axib_arprot ( ), + .m_axib_arlock ( ), + .m_axib_arcache ( ), + .m_axib_rready ( ), + .m_axib_rvalid ( '0 ), + .m_axib_rlast ( '0 ), + .m_axib_rdata ( '0 ), + .m_axib_rid ( '0 ), + .m_axib_rresp ( '0 ) +); + + + +/////////////////////////////////////////////////////////////////////////////////////////////////////////////// +// AXI BRAM connected to PCIe XDMA's AXI interface +/////////////////////////////////////////////////////////////////////////////////////////////////////////////// +axi_mpeg2encoder_wrapper #( + .AXI_IDWIDTH ( AXI_IDWIDTH ) +) axi_axi_mpeg2encoder_wrapper_i ( + .rstn ( rstn ), + .clk ( clk ), + // AXI Memory Mapped interface + .s_axi_awready ( axi_awready ), + .s_axi_awvalid ( axi_awvalid ), + .s_axi_awaddr ( axi_awaddr ), + .s_axi_awlen ( axi_awlen ), + .s_axi_awid ( axi_awid ), + .s_axi_wready ( axi_wready ), + .s_axi_wvalid ( axi_wvalid ), + .s_axi_wlast ( axi_wlast ), + .s_axi_wdata ( axi_wdata ), + .s_axi_bready ( axi_bready ), + .s_axi_bvalid ( axi_bvalid ), + .s_axi_bid ( axi_bid ), + .s_axi_bresp ( axi_bresp ), + .s_axi_arready ( axi_arready ), + .s_axi_arvalid ( axi_arvalid ), + .s_axi_araddr ( axi_araddr ), + .s_axi_arlen ( axi_arlen ), + .s_axi_arid ( axi_arid ), + .s_axi_rready ( axi_rready ), + .s_axi_rvalid ( axi_rvalid ), + .s_axi_rlast ( axi_rlast ), + .s_axi_rdata ( axi_rdata ), + .s_axi_rid ( axi_rid ), + .s_axi_rresp ( axi_rresp ) +); + + +endmodule diff --git a/xdma/ch_3/mpeg2encoder.sv b/xdma/ch_3/mpeg2encoder.sv new file mode 100644 index 0000000..4dd0d46 --- /dev/null +++ b/xdma/ch_3/mpeg2encoder.sv @@ -0,0 +1,2350 @@ + +module mpeg2encoder #( + parameter XL = 6, // determine the max horizontal pixel count. 4->256 pixels 5->512 pixels 6->1024 pixels 7->2048 pixels . + parameter YL = 6, // determine the max vertical pixel count. 4->256 pixels 5->512 pixels 6->1024 pixels 7->2048 pixels . + parameter VECTOR_LEVEL = 3, // motion vector range level, must be 1, 2, or 3. The larger the XL, the higher compression ratio, and the more LUT resource is uses. + parameter Q_LEVEL = 2 // quantize level, must be 1, 2, 3 or 4. The larger the Q_LEVEL, the higher compression ratio and the lower quality. +) ( + input wire rstn, // =0:async reset, =1:normal operation. It MUST be reset before starting to use. + input wire clk, + + // Video sequence configuration interface. -------------------------------------------------------------------------------------------------------------- + input wire [XL:0] i_xsize16, // horizontal pixel count = i_xsize16*16 . valid range: 4 ~ 2^XL + input wire [YL:0] i_ysize16, // vertical pixel count = i_ysize16*16 . valid range: 4 ~ 2^YL + input wire [ 7:0] i_pframes_count, // defines the number of P-frames between two I-frames. valid range: 0 ~ 255 + + // Video sequence input pixel stream interface. In each clock cycle, this interface can input 4 adjacent pixels in a row. Pixel format is YUV 4:4:4, the module will convert it to YUV 4:2:0, then compress it to MPEG2 stream. + input wire i_en, // when i_en=1, 4 adjacent pixels is being inputted, + input wire [ 7:0] i_Y0, i_Y1, i_Y2, i_Y3, // input Y (luminance) + input wire [ 7:0] i_U0, i_U1, i_U2, i_U3, // input U (Cb, chroma blue) + input wire [ 7:0] i_V0, i_V1, i_V2, i_V3, // input V (Cr, chroma red) + + // Video sequence control interface. -------------------------------------------------------------------------------------------------------------------- + input wire i_sequence_stop, // use this signal to stop a inputting video sequence + output wire o_sequence_busy, // =0: the module is idle and ready to encode the next sequence. =1: the module is busy encoding the current sequence + + // Video sequence output MPEG2 stream interface. -------------------------------------------------------------------------------------------------------- + output wire o_en, // o_en=1 indicates o_data is valid + output wire o_last, // o_en=1 & o_last=1 indicates this is the last data of a video sequence + output wire[255:0] o_data // output mpeg2 stream data, 32 bytes in BIG ENDIAN, i.e., o_data[255:248] is the 1st byte, o_data[247:0] is the 2nd byte, ... o_data[7:0] is the 32nd byte. +); + + + + +// +// Definition of nouns: +// tile : 8x8 pixels, the unit of DCT, quantize and zig-zag reorder +// block (blk) : contains 16x16 U pixels (4 tiles of Y, 1 tile of U, 1 tile of V) +// slice : a line of block (16 lines of pixels) +// +// Note : +// right shift: for signed number, use ">>>" rather than ">>". for unsigned number, using ">>>" and ">>" are both okay. +// + + + + + +////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +// local parameters : frame size +////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +localparam XB16 = XL , YB16 = YL ; +localparam XB8 = XB16 + 1 , YB8 = YB16 + 1; +localparam XB4 = XB8 + 1 , YB4 = YB8 + 1; +localparam XB2 = XB4 + 1 , YB2 = YB4 + 1; +localparam XB = XB2 + 1 , YB = YB2 + 1; + +localparam XSIZE = (1 << XB); // horizontal max pixel count +localparam YSIZE = (1 << YB); // vertical max pixel count + + +////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +// local parameters : motion estimation +////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +localparam int UR = VECTOR_LEVEL; // U/V motion vector range is in -YR~+YR pixels +localparam int YR = UR * 2; // Y motion vector range is in -YR~+YR pixels + + +////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +// local parameters : DCT +////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +localparam DCTP = 0; + +localparam logic signed [7:0] DCT_MATRIX [8][8] = '{ + '{ 64, 64, 64, 64, 64, 64, 64, 64 }, + '{ 89, 75, 50, 18, -18, -50, -75, -89 }, + '{ 84, 35, -35, -84, -84, -35, 35, 84 }, + '{ 75, -18, -89, -50, 50, 89, 18, -75 }, + '{ 64, -64, -64, 64, 64, -64, -64, 64 }, + '{ 50, -89, 18, 75, -75, -18, 89, -50 }, + '{ 35, -84, 84, -35, -35, 84, -84, 35 }, + '{ 18, -50, 75, -89, 89, -75, 50, -18 } +}; + +/* +localparam DCTP = 2; + +localparam logic signed [9:0] DCT_MATRIX [8][8] = '{ + '{ 256, 256, 256, 256, 256, 256, 256, 256 }, + '{ 355, 301, 201, 71, -71, -201, -301, -355 }, + '{ 334, 139, -139, -334, -334, -139, 139, 334 }, + '{ 301, -71, -355, -201, 201, 355, 71, -301 }, + '{ 256, -256, -256, 256, 256, -256, -256, 256 }, + '{ 201, -355, 71, 301, -301, -71, 355, -201 }, + '{ 139, -334, 334, -139, -139, 334, -334, 139 }, + '{ 71, -201, 301, -355, 355, -301, 201, -71 } +};*/ + + +////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +// local parameters : quantize +////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +localparam logic [6:0] INTRA_Q [8][8] = '{ + '{ 8, 16, 19, 22, 26, 27, 29, 34 }, + '{ 16, 16, 22, 24, 27, 29, 34, 37 }, + '{ 19, 22, 26, 27, 29, 34, 34, 38 }, + '{ 22, 22, 26, 27, 29, 34, 37, 40 }, + '{ 22, 26, 27, 29, 32, 35, 40, 48 }, + '{ 26, 27, 29, 32, 35, 40, 48, 58 }, + '{ 26, 27, 29, 34, 38, 46, 56, 69 }, + '{ 27, 29, 35, 38, 46, 56, 69, 83 } +}; + + +////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +// local parameters : zig-zag reorder +////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +localparam logic [5:0] ZIG_ZAG_TABLE [8][8] = '{ + '{ 0, 1, 5, 6, 14, 15, 27, 28 }, + '{ 2, 4, 7, 13, 16, 26, 29, 42 }, + '{ 3, 8, 12, 17, 25, 30, 41, 43 }, + '{ 9, 11, 18, 24, 31, 40, 44, 53 }, + '{ 10, 19, 23, 32, 39, 45, 52, 54 }, + '{ 20, 22, 33, 38, 46, 51, 55, 60 }, + '{ 21, 34, 37, 47, 50, 56, 59, 61 }, + '{ 35, 36, 48, 49, 57, 58, 62, 63 } +}; + + +////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +// local parameters : inverse DCT +////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +localparam logic signed [16:0] W1 = 17'sd2841; // 2048*sqrt(2)*cos(1*pi/16) +localparam logic signed [16:0] W2 = 17'sd2676; // 2048*sqrt(2)*cos(2*pi/16) +localparam logic signed [16:0] W3 = 17'sd2408; // 2048*sqrt(2)*cos(3*pi/16) +localparam logic signed [16:0] W5 = 17'sd1609; // 2048*sqrt(2)*cos(5*pi/16) +localparam logic signed [16:0] W6 = 17'sd1108; // 2048*sqrt(2)*cos(6*pi/16) +localparam logic signed [16:0] W7 = 17'sd565 ; // 2048*sqrt(2)*cos(7*pi/16) + + +////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +// local parameters : look-up-tables for variable length code (VLC) +////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +localparam logic [4:0] BITS_MOTION_VECTOR [17] = '{5'h01, 5'h01, 5'h01, 5'h01, 5'h03, 5'h05, 5'h04, 5'h03, 5'h0b, 5'h0a, 5'h09, 5'h11, 5'h10, 5'h0f, 5'h0e, 5'h0d, 5'h0c}; +localparam logic [3:0] LENS_MOTION_VECTOR [17] = '{4'd01, 4'd02, 4'd03, 4'd04, 4'd06, 4'd07, 4'd07, 4'd07, 4'd09, 4'd09, 4'd09, 4'd10, 4'd10, 4'd10, 4'd10, 4'd10, 4'd10}; + +localparam logic [4:0] BITS_NZ_FLAGS [64] = '{5'h00, 5'h0b, 5'h09, 5'h0d, 5'h0d, 5'h17, 5'h13, 5'h1f, 5'h0c, 5'h16, 5'h12, 5'h1e, 5'h13, 5'h1b, 5'h17, 5'h13, 5'h0b, 5'h15, 5'h11, 5'h1d, 5'h11, 5'h19, 5'h15, 5'h11, 5'h0f, 5'h0f, 5'h0d, 5'h03, 5'h0f, 5'h0b, 5'h07, 5'h07, 5'h0a, 5'h14, 5'h10, 5'h1c, 5'h0e, 5'h0e, 5'h0c, 5'h02, 5'h10, 5'h18, 5'h14, 5'h10, 5'h0e, 5'h0a, 5'h06, 5'h06, 5'h12, 5'h1a, 5'h16, 5'h12, 5'h0d, 5'h09, 5'h05, 5'h05, 5'h0c, 5'h08, 5'h04, 5'h04, 5'h07, 5'h0a, 5'h08, 5'h0c}; +localparam logic [3:0] LENS_NZ_FLAGS [64] = '{4'd00, 4'd05, 4'd05, 4'd06, 4'd04, 4'd07, 4'd07, 4'd08, 4'd04, 4'd07, 4'd07, 4'd08, 4'd05, 4'd08, 4'd08, 4'd08, 4'd04, 4'd07, 4'd07, 4'd08, 4'd05, 4'd08, 4'd08, 4'd08, 4'd06, 4'd08, 4'd08, 4'd09, 4'd05, 4'd08, 4'd08, 4'd09, 4'd04, 4'd07, 4'd07, 4'd08, 4'd06, 4'd08, 4'd08, 4'd09, 4'd05, 4'd08, 4'd08, 4'd08, 4'd05, 4'd08, 4'd08, 4'd09, 4'd05, 4'd08, 4'd08, 4'd08, 4'd05, 4'd08, 4'd08, 4'd09, 4'd05, 4'd08, 4'd08, 4'd09, 4'd03, 4'd05, 4'd05, 4'd06}; + +localparam logic [8:0] BITS_DC_Y [12] = '{ 9'h004, 9'h000, 9'h001, 9'h005, 9'h006, 9'h00e, 9'h01e, 9'h03e, 9'h07e, 9'h0fe, 9'h1fe, 9'h1ff}; +localparam logic [3:0] LENS_DC_Y [12] = '{ 4'd003, 4'd002, 4'd002, 4'd003, 4'd003, 4'd004, 4'd005, 4'd006, 4'd007, 4'd008, 4'd009, 4'd009}; + +localparam logic [9:0] BITS_DC_UV [12] = '{10'h000, 10'h001, 10'h002, 10'h006, 10'h00e, 10'h01e, 10'h03e, 10'h07e, 10'h0fe, 10'h1fe, 10'h3fe, 10'h3ff}; +localparam logic [3:0] LENS_DC_UV [12] = '{ 4'd002, 4'd002, 4'd002, 4'd003, 4'd004, 4'd005, 4'd006, 4'd007, 4'd008, 4'd009, 4'd010, 4'd010}; + +localparam logic [5:0] BITS_AC_0_3 [4][40] = '{ + '{6'h03, 6'h04, 6'h05, 6'h06, 6'h26, 6'h21, 6'h0a, 6'h1d, 6'h18, 6'h13, 6'h10, 6'h1a, 6'h19, 6'h18, 6'h17, 6'h1f, 6'h1e, 6'h1d, 6'h1c, 6'h1b, 6'h1a, 6'h19, 6'h18, 6'h17, 6'h16, 6'h15, 6'h14, 6'h13, 6'h12, 6'h11, 6'h10, 6'h18, 6'h17, 6'h16, 6'h15, 6'h14, 6'h13, 6'h12, 6'h11, 6'h10}, // runlen=0 , absvm1<40 + '{6'h03, 6'h06, 6'h25, 6'h0c, 6'h1b, 6'h16, 6'h15, 6'h1f, 6'h1e, 6'h1d, 6'h1c, 6'h1b, 6'h1a, 6'h19, 6'h13, 6'h12, 6'h11, 6'h10, 6'h0 , 6'h0 , 6'h0 , 6'h0 , 6'h0 , 6'h0 , 6'h0 , 6'h0 , 6'h0 , 6'h0 , 6'h0 , 6'h0 , 6'h0 , 6'h0 , 6'h0 , 6'h0 , 6'h0 , 6'h0 , 6'h0 , 6'h0 , 6'h0 , 6'h0 }, // runlen=1 , absvm1<18 + '{6'h05, 6'h04, 6'h0b, 6'h14, 6'h14, 6'h0 , 6'h0 , 6'h0 , 6'h0 , 6'h0 , 6'h0 , 6'h0 , 6'h0 , 6'h0 , 6'h0 , 6'h0 , 6'h0 , 6'h0 , 6'h0 , 6'h0 , 6'h0 , 6'h0 , 6'h0 , 6'h0 , 6'h0 , 6'h0 , 6'h0 , 6'h0 , 6'h0 , 6'h0 , 6'h0 , 6'h0 , 6'h0 , 6'h0 , 6'h0 , 6'h0 , 6'h0 , 6'h0 , 6'h0 , 6'h0 }, // runlen=2 , absvm1<5 + '{6'h07, 6'h24, 6'h1c, 6'h13, 6'h0 , 6'h0 , 6'h0 , 6'h0 , 6'h0 , 6'h0 , 6'h0 , 6'h0 , 6'h0 , 6'h0 , 6'h0 , 6'h0 , 6'h0 , 6'h0 , 6'h0 , 6'h0 , 6'h0 , 6'h0 , 6'h0 , 6'h0 , 6'h0 , 6'h0 , 6'h0 , 6'h0 , 6'h0 , 6'h0 , 6'h0 , 6'h0 , 6'h0 , 6'h0 , 6'h0 , 6'h0 , 6'h0 , 6'h0 , 6'h0 , 6'h0 } // runlen=3 , absvm1<4 +}; + +localparam logic [4:0] LENS_AC_0_3 [4][40] = '{ + '{5'd02, 5'd04, 5'd05, 5'd07, 5'd08, 5'd08, 5'd10, 5'd12, 5'd12, 5'd12, 5'd12, 5'd13, 5'd13, 5'd13, 5'd13, 5'd14, 5'd14, 5'd14, 5'd14, 5'd14, 5'd14, 5'd14, 5'd14, 5'd14, 5'd14, 5'd14, 5'd14, 5'd14, 5'd14, 5'd14, 5'd14, 5'd15, 5'd15, 5'd15, 5'd15, 5'd15, 5'd15, 5'd15, 5'd15, 5'd15}, + '{5'd03, 5'd06, 5'd08, 5'd10, 5'd12, 5'd13, 5'd13, 5'd15, 5'd15, 5'd15, 5'd15, 5'd15, 5'd15, 5'd15, 5'd16, 5'd16, 5'd16, 5'd16, 5'd0 , 5'd0 , 5'd0 , 5'd0 , 5'd0 , 5'd0 , 5'd0 , 5'd0 , 5'd0 , 5'd0 , 5'd0 , 5'd0 , 5'd0 , 5'd0 , 5'd0 , 5'd0 , 5'd0 , 5'd0 , 5'd0 , 5'd0 , 5'd0 , 5'd0 }, + '{5'd04, 5'd07, 5'd10, 5'd12, 5'd13, 5'd0 , 5'd0 , 5'd0 , 5'd0 , 5'd0 , 5'd0 , 5'd0 , 5'd0 , 5'd0 , 5'd0 , 5'd0 , 5'd0 , 5'd0 , 5'd0 , 5'd0 , 5'd0 , 5'd0 , 5'd0 , 5'd0 , 5'd0 , 5'd0 , 5'd0 , 5'd0 , 5'd0 , 5'd0 , 5'd0 , 5'd0 , 5'd0 , 5'd0 , 5'd0 , 5'd0 , 5'd0 , 5'd0 , 5'd0 , 5'd0 }, + '{5'd05, 5'd08, 5'd12, 5'd13, 5'd0 , 5'd0 , 5'd0 , 5'd0 , 5'd0 , 5'd0 , 5'd0 , 5'd0 , 5'd0 , 5'd0 , 5'd0 , 5'd0 , 5'd0 , 5'd0 , 5'd0 , 5'd0 , 5'd0 , 5'd0 , 5'd0 , 5'd0 , 5'd0 , 5'd0 , 5'd0 , 5'd0 , 5'd0 , 5'd0 , 5'd0 , 5'd0 , 5'd0 , 5'd0 , 5'd0 , 5'd0 , 5'd0 , 5'd0 , 5'd0 , 5'd0 } +}; + +localparam logic [5:0] BITS_AC_4_31 [32][3] = '{ + '{6'h0 , 6'h0 , 6'h0 }, // runlen=0 , unused + '{6'h0 , 6'h0 , 6'h0 }, // runlen=1 , unused + '{6'h0 , 6'h0 , 6'h0 }, // runlen=2 , unused + '{6'h0 , 6'h0 , 6'h0 }, // runlen=3 , unused + '{6'h06, 6'h0f, 6'h12}, // runlen=4 , absvm1<3 + '{6'h07, 6'h09, 6'h12}, // runlen=5 , absvm1<3 + '{6'h05, 6'h1e, 6'h14}, // runlen=6 , absvm1<3 + '{6'h04, 6'h15, 6'h0 }, // runlen=7 , absvm1<2 + '{6'h07, 6'h11, 6'h0 }, // runlen=8 , absvm1<2 + '{6'h05, 6'h11, 6'h0 }, // runlen=9 , absvm1<2 + '{6'h27, 6'h10, 6'h0 }, // runlen=10, absvm1<2 + '{6'h23, 6'h1a, 6'h0 }, // runlen=11, absvm1<2 + '{6'h22, 6'h19, 6'h0 }, // runlen=12, absvm1<2 + '{6'h20, 6'h18, 6'h0 }, // runlen=13, absvm1<2 + '{6'h0e, 6'h17, 6'h0 }, // runlen=14, absvm1<2 + '{6'h0d, 6'h16, 6'h0 }, // runlen=15, absvm1<2 + '{6'h08, 6'h15, 6'h0 }, // runlen=16, absvm1<2 + '{6'h1f, 6'h0 , 6'h0 }, // runlen=17, absvm1<1 + '{6'h1a, 6'h0 , 6'h0 }, // runlen=18, absvm1<1 + '{6'h19, 6'h0 , 6'h0 }, // runlen=19, absvm1<1 + '{6'h17, 6'h0 , 6'h0 }, // runlen=20, absvm1<1 + '{6'h16, 6'h0 , 6'h0 }, // runlen=21, absvm1<1 + '{6'h1f, 6'h0 , 6'h0 }, // runlen=22, absvm1<1 + '{6'h1e, 6'h0 , 6'h0 }, // runlen=23, absvm1<1 + '{6'h1d, 6'h0 , 6'h0 }, // runlen=24, absvm1<1 + '{6'h1c, 6'h0 , 6'h0 }, // runlen=25, absvm1<1 + '{6'h1b, 6'h0 , 6'h0 }, // runlen=26, absvm1<1 + '{6'h1f, 6'h0 , 6'h0 }, // runlen=27, absvm1<1 + '{6'h1e, 6'h0 , 6'h0 }, // runlen=28, absvm1<1 + '{6'h1d, 6'h0 , 6'h0 }, // runlen=29, absvm1<1 + '{6'h1c, 6'h0 , 6'h0 }, // runlen=30, absvm1<1 + '{6'h1b, 6'h0 , 6'h0 } // runlen=31, absvm1<1 +}; + +localparam logic [4:0] LENS_AC_4_31 [32][3] = '{ + '{5'd0 , 5'd0 , 5'd0 }, + '{5'd0 , 5'd0 , 5'd0 }, + '{5'd0 , 5'd0 , 5'd0 }, + '{5'd0 , 5'd0 , 5'd0 }, + '{5'd05, 5'd10, 5'd12}, + '{5'd06, 5'd10, 5'd13}, + '{5'd06, 5'd12, 5'd16}, + '{5'd06, 5'd12, 5'd0 }, + '{5'd07, 5'd12, 5'd0 }, + '{5'd07, 5'd13, 5'd0 }, + '{5'd08, 5'd13, 5'd0 }, + '{5'd08, 5'd16, 5'd0 }, + '{5'd08, 5'd16, 5'd0 }, + '{5'd08, 5'd16, 5'd0 }, + '{5'd10, 5'd16, 5'd0 }, + '{5'd10, 5'd16, 5'd0 }, + '{5'd10, 5'd16, 5'd0 }, + '{5'd12, 5'd0 , 5'd0 }, + '{5'd12, 5'd0 , 5'd0 }, + '{5'd12, 5'd0 , 5'd0 }, + '{5'd12, 5'd0 , 5'd0 }, + '{5'd12, 5'd0 , 5'd0 }, + '{5'd13, 5'd0 , 5'd0 }, + '{5'd13, 5'd0 , 5'd0 }, + '{5'd13, 5'd0 , 5'd0 }, + '{5'd13, 5'd0 , 5'd0 }, + '{5'd13, 5'd0 , 5'd0 }, + '{5'd16, 5'd0 , 5'd0 }, + '{5'd16, 5'd0 , 5'd0 }, + '{5'd16, 5'd0 , 5'd0 }, + '{5'd16, 5'd0 , 5'd0 }, + '{5'd16, 5'd0 , 5'd0 } +}; + + + + + + +////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +// functions +////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +function automatic logic [7:0] mean2 (input logic [7:0] a, input logic [7:0] b); + return (8)'( ( 9'd1 + (9)'(a) + (9)'(b) ) >> 1 ) ; +endfunction + + +function automatic logic [7:0] mean4 (input logic [7:0] a, input logic [7:0] b, input logic [7:0] c, input logic [7:0] d); + return (8)'( ( 10'd1 + (10)'(a) + (10)'(b) + (10)'(c) + (10)'(d) ) >> 2 ) ; +endfunction + + +function automatic logic [7:0] func_diff (input logic [7:0] a, input logic [7:0] b); + return (a>b) ? (a-b) : (b-a); +endfunction + + +function automatic logic signed [8:0] clip_neg255_pos255(input logic signed [27:0] x); + return (x < -28'sd255) ? -9'sd255 : (x > 28'sd255) ? 9'sd255 : (9)'(x) ; +endfunction + + +function automatic logic [7:0] add_clip_0_255 (input logic [7:0] a, input logic signed [8:0] b); + logic [9:0] c = b; + c += $signed( (10)'(a) ); + return (c > 10'sd255) ? 8'd255 : (c < 10'sd0) ? 8'd0 : (8)'( $unsigned(c) ) ; +endfunction + + +function automatic logic [3:0] find_min_in_10_values (input logic [12:0] v0, input logic [12:0] v1, input logic [12:0] v2, input logic [12:0] v3, input logic [12:0] v4, input logic [12:0] v5, input logic [12:0] v6, input logic [12:0] v7, input logic [12:0] v8, input logic [12:0] v9 ); + logic wi1, wi3, wi5, wi7, wi9; + logic [12:0] w01, w23, w45, w67, w89; + logic xi23, xi67; + logic [12:0] x0123, x4567; + wi1 = v1 < v0; + w01 = wi1 ? v1 : v0; + wi3 = v3 < v2; + w23 = wi3 ? v3 : v2; + wi5 = v5 < v4; + w45 = wi5 ? v5 : v4; + wi7 = v7 < v6; + w67 = wi7 ? v7 : v6; + wi9 = v9 < v8; + w89 = wi9 ? v9 : v8; + xi23 = w23 < w01; + x0123 = xi23 ? w23 : w01; + xi67 = w67 < w45; + x4567 = xi67 ? w67 : w45; + if( w89 <= x0123 && w89 <= x4567) begin + return {3'b100, wi9}; + end else if(x0123 < x4567) begin + if( xi23 ) + return {3'b001, wi3}; + else + return {3'b000, wi1}; + end else begin + if( xi67 ) + return {3'b011, wi7}; + else + return {3'b010, wi5}; + end +endfunction + + +// inverse two dimensional DCT (Chen-Wang algorithm) stage 1: right multiply a matrix, act on each rows +function automatic logic [32*9-1:0] invserse_dct_rows_step12 (input logic signed [12:0] a0, input logic signed [12:0] a1, input logic signed [12:0] a2, input logic signed [12:0] a3, input logic signed [12:0] a4, input logic signed [12:0] a5, input logic signed [12:0] a6, input logic signed [12:0] a7 ); + logic signed [31:0] x0, x1, x2, x3, x4, x5, x6, x7, x8; + x0 = a0; + x1 = a4; + x2 = a6; + x3 = a2; + x4 = a1; + x5 = a7; + x6 = a5; + x7 = a3; + x0 <<= 11; + x1 <<= 11; + x0[7] = 1'b1; // x0 += 128 , for proper rounding in the fourth stage + // step 1 ---------------------------------------------------------------------------------- + x8 = W7 * (x4+x5); + x4 = x8 + (W1-W7) * x4; + x5 = x8 - (W1+W7) * x5; + x8 = W3 * (x6+x7); + x6 = x8 - (W3-W5) * x6; + x7 = x8 - (W3+W5) * x7; + // step 2 ---------------------------------------------------------------------------------- + x8 = x0 + x1; + x0 -= x1; + x1 = W6 * (x3+x2); + x2 = x1 - (W2+W6) * x2; + x3 = x1 + (W2-W6) * x3; + x1 = x4 + x6; + x4 -= x6; + x6 = x5 + x7; + x5 -= x7; + return {x0, x1, x2, x3, x4, x5, x6, x7, x8}; +endfunction + +function automatic logic [18*8-1:0] invserse_dct_rows_step34 (logic [32*9-1:0] x0_to_x8); + logic signed [31:0] x0, x1, x2, x3, x4, x5, x6, x7, x8; + {x0, x1, x2, x3, x4, x5, x6, x7, x8} = x0_to_x8; + // step 3 ---------------------------------------------------------------------------------- + x7 = x8 + x3; + x8 -= x3; + x3 = x0 + x2; + x0 -= x2; + x2 = (32'sd181 * (x4+x5) + 32'sd128) >>> 8; + x4 = (32'sd181 * (x4-x5) + 32'sd128) >>> 8; + // step 4 ---------------------------------------------------------------------------------- + return { (18)'( (x7 + x1) >>> 8 ), + (18)'( (x3 + x2) >>> 8 ), + (18)'( (x0 + x4) >>> 8 ), + (18)'( (x8 + x6) >>> 8 ), + (18)'( (x8 - x6) >>> 8 ), + (18)'( (x0 - x4) >>> 8 ), + (18)'( (x3 - x2) >>> 8 ), + (18)'( (x7 - x1) >>> 8 ) }; +endfunction + + +// inverse two dimensional DCT (Chen-Wang algorithm) stage 2: left multiply a matrix, act on each columns +function automatic logic [32*9-1:0] invserse_dct_cols_step12 (input logic signed [17:0] a0, input logic signed [17:0] a1, input logic signed [17:0] a2, input logic signed [17:0] a3, input logic signed [17:0] a4, input logic signed [17:0] a5, input logic signed [17:0] a6, input logic signed [17:0] a7 ); + logic signed [31:0] x0, x1, x2, x3, x4, x5, x6, x7, x8; + x0 = a0; + x1 = a4; + x2 = a6; + x3 = a2; + x4 = a1; + x5 = a7; + x6 = a5; + x7 = a3; + x0 <<= 8; + x1 <<= 8; + x0 += 32'sd8192; + // step 1 ---------------------------------------------------------------------------------- + x8 = W7 * (x4+x5) + 32'sd4; + x4 = (x8 + (W1-W7) * x4) >>> 3; + x5 = (x8 - (W1+W7) * x5) >>> 3; + x8 = W3 * (x6+x7) + 32'sd4; + x6 = (x8 - (W3-W5) * x6) >>> 3; + x7 = (x8 - (W3+W5) * x7) >>>3; + // step 2 ---------------------------------------------------------------------------------- + x8 = x0 + x1; + x0 -= x1; + x1 = W6 * (x3+x2) + 32'sd4; + x2 = (x1 - (W2+W6) * x2) >>> 3; + x3 = (x1 + (W2-W6) * x3) >>> 3; + x1 = x4 + x6; + x4 -= x6; + x6 = x5 + x7; + x5 -= x7; + return {x0, x1, x2, x3, x4, x5, x6, x7, x8}; +endfunction + +function automatic logic [9*8-1:0] invserse_dct_cols_step34(input logic [32*9-1:0] x0_to_x8); + logic signed [31:0] x0, x1, x2, x3, x4, x5, x6, x7, x8; + {x0, x1, x2, x3, x4, x5, x6, x7, x8} = x0_to_x8; + // step 3 ---------------------------------------------------------------------------------- + x7 = x8 + x3; + x8 -= x3; + x3 = x0 + x2; + x0 -= x2; + x2 = (32'sd181 * (x4+x5) + 32'sd128) >>> 8; + x4 = (32'sd181 * (x4-x5) + 32'sd128) >>> 8; + // step 4 ---------------------------------------------------------------------------------- + return { clip_neg255_pos255( (x7+x1) >>> 14 ), + clip_neg255_pos255( (x3+x2) >>> 14 ), + clip_neg255_pos255( (x0+x4) >>> 14 ), + clip_neg255_pos255( (x8+x6) >>> 14 ), + clip_neg255_pos255( (x8-x6) >>> 14 ), + clip_neg255_pos255( (x0-x4) >>> 14 ), + clip_neg255_pos255( (x3-x2) >>> 14 ), + clip_neg255_pos255( (x7-x1) >>> 14 ) }; +endfunction + + + + + +////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +// stage A : overall control, horizontal U/V subsample +////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +// overall configuration variables +reg [ 7:0] pframes_count; + +wire [ XB16-1:0] i_max_x16 = ( i_xsize16 > (XL+1)'(1< (YL+1)'(1<> 2); + e_y16 <= d_y16; + e_start_blk <= (c_flip != d_flop) && d_x4[1:0] == 2'd0 && d_y_16 == 4'd0; // start of a block (16x16 Y) + e_en_blk <= (c_flip != d_flop) && d_x4[1:0] == 2'd3 && d_y_16 == 4'd15; // end of a block (16x16 Y) + e_Y_en <= (c_flip != d_flop); + e_UV_en <= (c_flip != d_flop) && d_y_16[0]; + end + +// shift the double-buffer's output to get a new block +reg [ 7:0] e_Y_blk [16][16]; +reg [ 7:0] e_U_blk [ 8][ 8]; +reg [ 7:0] e_V_blk [ 8][ 8]; + +always @ (*) begin + {e_Y_blk[15][12], e_Y_blk[15][13], e_Y_blk[15][14], e_Y_blk[15][15]} = e_Y_rd; + {e_U_blk[7][6], e_U_blk[7][7]} = e_U_rd; + {e_V_blk[7][6], e_V_blk[7][7]} = e_V_rd; +end + +always @ (posedge clk) begin + if( e_Y_en ) begin // shift to save a block of Y (16x16 Y) + for (int x=0; x<16; x++) + for(int y=0; y<15; y++) + e_Y_blk[y][x] <= e_Y_blk[y+1][x]; + for(int x=0; x<12; x++) + e_Y_blk[15][x] <= e_Y_blk[0][x+4]; + end + if( e_UV_en ) begin // shift to save a block of U/V (8x8 U and 8x8 V) + for (int x=0; x<8; x++) + for(int y=0; y<7; y++) begin + e_U_blk[y][x] <= e_U_blk[y+1][x]; + e_V_blk[y][x] <= e_V_blk[y+1][x]; + end + for(int x=0; x<6; x++) begin + e_U_blk[7][x] <= e_U_blk[0][x+2]; + e_V_blk[7][x] <= e_V_blk[0][x+2]; + end + end +end + + + + + +////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +// stage X & Y & Z : read reference frame memory +////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +reg [ 8*8-1:0] mem_ref_Y [ (YSIZE ) * (XSIZE/8 ) ]; // Y reference frame memory : (YSIZE ) rows, XSIZE/8 cols , each item contains 8 Y pixels +reg [ 8*8-1:0] mem_ref_UV [ (YSIZE/2) * (XSIZE/16) * 2 ]; // U/V reference frame memory : (YSIZE/2) rows, XSIZE/16 cols, 2 channels (U/V), each item contains 8 U or V pixels + +reg [ 4:0] x_cnt; +reg [ XB16-1:0] x_x16; +logic [ YB16-1:0] x_y16; // temporary variable, not real register +reg x_x8_2; +reg [ YB-1:0] x_y; + +reg y_Y_en; +reg y_U_en; +reg y_V_en; + +reg [ 8*8-1:0] y_Y_rd; +reg [ 8*8-1:0] y_UV_rd; + +reg z_Y_en; +reg z_U_en; +reg z_V_en; + +reg [ 8*8-1:0] z_Y_rd; +reg [ 8*8-1:0] z_UV_rd; + +always @ (posedge clk or negedge rstn) + if (~rstn) begin + y_Y_en <= 1'b0; + y_U_en <= 1'b0; + y_V_en <= 1'b0; + x_x16 <= '0; + x_y <= '0; + x_x8_2 <= '0; + x_cnt <= '1; + end else begin // reference frame read control : + y_Y_en <= 1'b0; + y_U_en <= 1'b0; + y_V_en <= 1'b0; + if(e_start_blk) begin // when start to read a current block, start to read the reference blocks (whose position is at the right side of the current block) + if ( e_y16 == max_y16 && e_x16 == max_x16 ) begin // current block is at the bottom-right corner of the current image + x_x16 <= (XB16)'(0); // the reference block to read is at the top-left corner of reference image + x_y16 = (YB16)'(0); + end else if ( e_x16 == max_x16 ) begin // current block is the right-most block of the current image + x_x16 <= (XB16)'(0); // the reference block to read is the left-most block in the next row + x_y16 = e_y16 + (YB16)'(1); + end else begin // current block is NOT the right-most block of the current image + x_x16 <= e_x16 + (XB16)'(1); // the reference block to read is at the right side of the current block + x_y16 = e_y16; + end + x_y <= ((YB)'(x_y16) << 4) - (YB)'(YR); + x_x8_2 <= '0; + x_cnt <= '0; + end else if( x_cnt < (5)'(16+2*YR) ) begin // for each block, need to read YR+16+YR lines of Y + if(x_x8_2) begin + x_cnt <= x_cnt + 5'd1; + x_y <= x_y + (YB)'(1); + end + x_x8_2 <= ~x_x8_2; + y_Y_en <= 1'b1; + y_U_en <= ~x_y[0] & ~x_x8_2; + y_V_en <= ~x_y[0] & x_x8_2; + end + end + +always @ (posedge clk or negedge rstn) + if (~rstn) begin + z_Y_en <= '0; + z_U_en <= '0; + z_V_en <= '0; + end else begin + z_Y_en <= y_Y_en; + z_U_en <= y_U_en; + z_V_en <= y_V_en; + end + +always @ (posedge clk) begin + y_Y_rd <= mem_ref_Y [ {x_y , x_x16, x_x8_2} ] ; + y_UV_rd <= mem_ref_UV[ {x_y[YB-1:1], x_x16, x_x8_2} ] ; +end + +always @ (posedge clk) begin + z_Y_rd <= y_Y_rd; + z_UV_rd <= y_UV_rd; +end + +reg [7:0] z_Y_ref [-YR:16+YR-1] [16]; +reg [7:0] z_U_ref [-UR: 8+UR-1] [ 8]; +reg [7:0] z_V_ref [-UR: 8+UR-1] [ 8]; + +always @ (posedge clk) begin + if(z_Y_en) begin + for (int x=0; x<8; x++) begin + for (int y=-YR; y<16+YR; y++) + z_Y_ref[y][x] <= z_Y_ref[y][x+8]; + for (int y=-YR; y<16+YR-1; y++) + z_Y_ref[y][x+8] <= z_Y_ref[y+1][x]; + z_Y_ref[16+YR-1][x+8] <= z_Y_rd[x*8+:8]; // push the new data to the last item of z_Y_ref + end + end + if(z_U_en) begin + for (int x=0; x<8; x++) begin + for (int y=-UR; y<8+UR-1; y++) + z_U_ref[y][x] <= z_U_ref[y+1][x]; // shift z_U_ref + z_U_ref[8+UR-1][x] <= z_UV_rd[x*8+:8]; // push the new data to the last item of z_U_ref + end + end + if(z_V_en) begin + for (int x=0; x<8; x++) begin + for (int y=-UR; y<8+UR-1; y++) + z_V_ref[y][x] <= z_V_ref[y+1][x]; // shift z_V_ref + z_V_ref[8+UR-1][x] <= z_UV_rd[x*8+:8]; // push the new data to the last item of z_V_ref + end + end +end + + + + + +////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +// stage F : motion estimation +////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +reg [ 7:0] f_i_frame; +reg [ XB16-1:0] f_x16 ; +reg [ YB16-1:0] f_y16 ; + +reg [ 15:0] f_Y_sum ; +reg [ 7:0] f_Y_mean; + +reg [ 7:0] f_Y_blk [16][16]; // Y current block +reg [ 7:0] f_U_blk [ 8][ 8]; // U current block +reg [ 7:0] f_V_blk [ 8][ 8]; // V current block + +reg [ 7:0] f_Y_ref [-YR:16+YR-1][-YR:16+16-1]; // Y reference +reg [ 7:0] f_U_ref [-UR: 8+UR-1][-UR:8+8-1]; // U reference +reg [ 7:0] f_V_ref [-UR: 8+UR-1][-UR:8+8-1]; // V reference + +reg [ 7:0] f_Y_prd [16][16]; // Y predicted block +reg [ 7:0] f_U_prd [-UR:8+UR-1][-UR:8+UR-1]; // U predicted block +reg [ 7:0] f_V_prd [-UR:8+UR-1][-UR:8+UR-1]; // V predicted block + +reg [ 7:0] f_Y_tmp [-YR:16+YR-1][-YR:16+YR-1]; // Y temporary reference map for full pixel search +reg [ 7:0] f_Y_hlf [-1:31][-1:31]; // Y temporary reference map for half pixel search + +reg [ 11:0] f_diff [-YR:YR][-YR:YR]; // up: YR, middle: 1, down: YR. left: YR, middle: 1, right: YR. +reg f_over [-YR:YR][-YR:YR]; // + +reg signed [ 1:0] f_mvxh , f_mvyh; // -1, 0, +1 +reg signed [ 4:0] f_mvx , f_mvy; // +reg f_inter ; + +reg f_en_blk ; + +reg [ 3:0] f_cnt ; // 0~15 + +enum reg [ 3:0] { + MV_IDLE , PREPARE_SEARCH_FULL, CALC_DIFF , CALC_MIN , + CALC_MOTION_VECTOR_Y , CALC_MOTION_VECTOR_X , + REF_SHIFT_Y , REF_SHIFT_X , + PREPARE_SEARCH_HALF , CALC_DIFF_HALF , + CALC_MIN_HALF1 , CALC_MIN_HALF2 , + REF_UV_SHIFT_Y , REF_UV_SHIFT_X , PREDICT +} f_stat ; + +logic [ 11:0] diff ; // temporary variable, not real register +logic tmpbit1, tmpbit2 ; // temporary variable, not real register + + +always @ (posedge clk or negedge rstn) + if (~rstn) begin + f_en_blk <= 1'b0; + f_cnt <= '0; + f_stat <= MV_IDLE; + end else begin + f_en_blk <= 1'b0; + f_cnt <= '0; + + case (f_stat) + MV_IDLE : begin + if(e_en_blk) + f_stat <= PREPARE_SEARCH_FULL; + end + + PREPARE_SEARCH_FULL : + f_stat <= CALC_DIFF; + + CALC_DIFF : begin + if(f_cnt < 4'd15) + f_cnt <= f_cnt + 4'd1; + else + f_stat <= CALC_MIN; + end + + CALC_MIN : begin + if( f_cnt < 4'd5 ) + f_cnt <= f_cnt + 4'd1; + else + f_stat <= CALC_MOTION_VECTOR_Y; + end + + CALC_MOTION_VECTOR_Y : + f_stat <= CALC_MOTION_VECTOR_X; + + CALC_MOTION_VECTOR_X : + f_stat <= REF_SHIFT_Y; + + REF_SHIFT_Y : begin + if(f_cnt < (4)'(YR-1) ) + f_cnt <= f_cnt + 4'd1; + else + f_stat <= REF_SHIFT_X; + end + + REF_SHIFT_X : begin + if(f_cnt < (4)'(YR-1) ) + f_cnt <= f_cnt + 4'd1; + else + f_stat <= PREPARE_SEARCH_HALF; + end + + PREPARE_SEARCH_HALF : + f_stat <= CALC_DIFF_HALF; + + CALC_DIFF_HALF : begin + if(f_cnt < 4'd15) + f_cnt <= f_cnt + 4'd1; + else + f_stat <= CALC_MIN_HALF1; + end + + CALC_MIN_HALF1 : + f_stat <= CALC_MIN_HALF2; + + CALC_MIN_HALF2 : + f_stat <= REF_UV_SHIFT_Y; + + REF_UV_SHIFT_Y : begin + if(f_cnt < 4'd2) + f_cnt <= f_cnt + 4'd1; + else + f_stat <= REF_UV_SHIFT_X; + end + + REF_UV_SHIFT_X : begin + if(f_cnt < 4'd2) + f_cnt <= f_cnt + 4'd1; + else + f_stat <= PREDICT; + end + + PREDICT : begin + f_stat <= MV_IDLE; + f_en_blk <= 1'b1; + end + endcase + end + + +always @ (posedge clk) + case(f_stat) + + // state: start, load current block and its reference -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + MV_IDLE : begin + if(e_en_blk) begin + f_i_frame <= e_i_frame; + f_y16 <= e_y16; + f_x16 <= e_x16; + end + + f_Y_sum <= '0; + + for (int y=0; y<16; y++) + for (int x=0; x<16; x++) + f_Y_blk[y][x] <= e_Y_blk[y][x]; // load current Y block + + for (int y=0; y<8; y++) + for (int x=0; x<8; x++) begin + f_U_blk[y][x] <= e_U_blk[y][x]; // load current U block + f_V_blk[y][x] <= e_V_blk[y][x]; // load current V block + end + + if(e_en_blk) begin + for (int y=-YR; y<16+YR; y++) begin + for (int x=-YR; x<16; x++) + f_Y_ref[y][x] <= f_Y_ref[y][x+16]; // left shift old Y reference 16 steps + for (int x=0; x<16; x++) + f_Y_ref[y][x+16] <= z_Y_ref[y][x]; // load new Y reference + end + + for (int y=-UR; y<8+UR; y++) begin + for (int x=-UR; x<8 ; x++) begin + f_U_ref[y][x] <= f_U_ref[y][x+8]; // left shift old U reference by 8 steps + f_V_ref[y][x] <= f_V_ref[y][x+8]; // left shift old V reference by 8 steps + end + for (int x=0; x<8; x++) begin + f_U_ref[y][x+8] <= z_U_ref[y][x]; // load new U reference + f_V_ref[y][x+8] <= z_V_ref[y][x]; // load new V reference + end + end + end + end + + // state: YR cycles -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + PREPARE_SEARCH_FULL : begin + for (int y=-YR; y<16+YR; y++) + for (int x=-YR; x<16+YR; x++) + f_Y_tmp[y][x] <= f_Y_ref[y][x]; // load f_Y_tmp from f_Y_ref : prepare for REF_SHIFT_Y + + for (int y=-YR; y<=YR; y++) + for (int x=-YR; x<=YR; x++) begin + f_diff[y][x] <= '0; // clear diff map + f_over[y][x] <=( (f_x16 == '0 && x<0 ) || // for left-most block, disable the motion-vector that mvx<0, + (f_x16 == max_x16 && x>0 ) || // for right-most block, disable the motion-vector that mvx>0, + (f_y16 == '0 && y<0 ) || // for top-most block, disable the motion-vector that mvy<0, + (f_y16 == max_y16 && y>0 ) ); // for bottom-most block, disable the motion-vector that mvy>0. + end + end + + // state: 16 cycles -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + CALC_DIFF : begin + for (int y=0; y<16; y++) + for (int x=0; x<16; x++) + f_Y_blk[y][x] <= f_Y_blk[y][(x+1)%16]; // cyclic left shift f_Y_blk by 1 step + + for (int y=-YR; y<16+YR ; y++) + for (int x=-YR; x<16+YR-1; x++) + f_Y_tmp[y][x] <= f_Y_tmp[y][x+1]; // left shift f_Y_tmp by 1 step + + diff = '0; + for(int y=0; y<16; y++) + diff += (12)'( f_Y_blk[y][0] ); + f_Y_sum <= f_Y_sum + (16)'(diff); // calculate sum of f_Y_blk + + for (int y=-YR; y<=YR; y++) + for (int x=-YR; x<=YR; x++) begin + diff = '0; + for (int yt=0; yt<16; yt++) + diff += (12)'( func_diff( f_Y_blk[yt][0] , f_Y_tmp[yt+y][x] ) ); + if( ~f_over[y][x] ) + {f_over[y][x], f_diff[y][x]} <= (13)'(f_diff[y][x]) + (13)'(diff) ; + end + end + + // state: 6 cycles -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + CALC_MIN : begin + tmpbit1 = 1'b1; + for (int y=-YR; y<=YR; y++) + for (int x=-YR; x<=YR; x++) + tmpbit1 &= f_over[y][x] | f_diff[y][x][11] ; + + tmpbit2 = 1'b1; + for (int y=-YR; y<=YR; y++) + for (int x=-YR; x<=YR; x++) + tmpbit2 &= f_over[y][x] | (f_diff[y][x][11] & ~tmpbit1) | f_diff[y][x][10] ; + + for (int y=-YR; y<=YR; y++) + for (int x=-YR; x<=YR; x++) begin + f_over[y][x] <= f_over[y][x] | (f_diff[y][x][11] & ~tmpbit1) | (f_diff[y][x][10] & ~tmpbit2); + f_diff[y][x] <= f_diff[y][x] << 2 ; + end + end + + // state: 1 cycle -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + CALC_MOTION_VECTOR_Y : begin + f_mvy <= '0; + for (int y=-YR; y<=YR; y++) begin + tmpbit1 = 1'b1; + for (int x=-YR; x<=YR; x++) + tmpbit1 &= f_over[y][x] ; + if( ~tmpbit1 ) + f_mvy <= (5)'(y); // use f_over to get the y of motion vector's x + end + end + + // state: 1 cycle -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + CALC_MOTION_VECTOR_X : begin + f_mvx <= '0; + for (int x=-YR; x<=YR; x++) + if( ~f_over[f_mvy][x] ) + f_mvx <= (5)'(x); // use f_over to get the x of motion vector's x + + for (int y=-YR; y<16+YR; y++) + for (int x=-YR; x<16+YR; x++) + f_Y_tmp[y][x] <= f_Y_ref[y][x]; // load f_Y_tmp from f_Y_ref : prepare for REF_SHIFT_Y + end + + + // state: YR cycles -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + REF_SHIFT_Y : begin + if ( f_mvy > 5'sd0 && (5)'(f_cnt) < $unsigned( f_mvy) ) // up shift Y + for (int y=0 ; y<16+YR; y++) // needn't to shift the pixels of y<-1, since they are discarded + for (int x=-YR; x<16+YR; x++) + f_Y_tmp[y-1][x] <= f_Y_tmp[y][x] ; + else if ( f_mvy < 5'sd0 && (5)'(f_cnt) < $unsigned(-f_mvy) ) // down shift Y + for (int y=-YR; y<16 ; y++) // needn't to shift the pixels of y>16, since they are discarded + for (int x=-YR; x<16+YR; x++) + f_Y_tmp[y+1][x] <= f_Y_tmp[y][x] ; + end + + // state: YR cycles -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + REF_SHIFT_X : begin + if ( f_mvx > 5'sd0 && (5)'(f_cnt) < $unsigned( f_mvx) ) // left shift Y + for (int y=-1; y<=16 ; y++) // needn't to shift the pixels of y<-1 and y>16, since they are discarded + for (int x=0 ; x<16+YR; x++) // needn't to shift the pixels of x<-1, since they are discarded + f_Y_tmp[y][x-1] <= f_Y_tmp[y][x] ; + else if ( f_mvx < 5'sd0 && (5)'(f_cnt) < $unsigned(-f_mvx) ) // right shift Y + for (int y=-1; y<=16; y++) // needn't to shift the pixels of y<-1 and y>16, since they are discarded + for (int x=-YR; x<16; x++) // needn't to shift the pixels of x>16, since they are discarded + f_Y_tmp[y][x+1] <= f_Y_tmp[y][x] ; + end + + // state: 1 cycle -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + PREPARE_SEARCH_HALF : begin + f_Y_mean <= f_Y_sum[15:8]; + + for (int y=-1; y<16; y++) + for(int x=-1; x<16; x++) begin + if(-20 ) || + ( (f_y16 == '0 || f_mvy == (5)'(-YR) ) && y<0 ) || + ( (f_y16 == max_y16 || f_mvy == (5)'( YR) ) && y>0 ) ); + end + end + + // state: 16 cycles -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + CALC_DIFF_HALF : begin + for (int y=0; y<16; y++) + for(int x=0; x<16; x++) + f_Y_blk[y][x] <= f_Y_blk[y][(x+1)%16]; // cyclic left shift f_Y_blk by 1 step + + for (int y=-1; y<32; y++) + for(int x=-1; x<30; x++) + f_Y_hlf[y][x] <= f_Y_hlf[y][x+2]; // left shift f_Y_hlf by 2 steps + + diff = '0; + for(int y=0; y<16; y++) + diff += (12)'( func_diff( f_Y_blk[y][0] , f_Y_mean ) ); + f_Y_sum <= f_Y_sum + (16)'(diff); // calculate diff of f_Y_blk and f_Y_mean + + for (int y=-1; y<=1; y++) + for (int x=-1; x<=1; x++) begin + diff = '0; + for (int yt=0; yt<16; yt++) + diff += (12)'( func_diff( f_Y_blk[yt][0] , f_Y_hlf[y+2*yt][x] ) ); + if( ~f_over[y][x] ) + {f_over[y][x], f_diff[y][x]} <= (13)'(f_diff[y][x]) + (13)'(diff) ; + end + end + + // state: 1 cycle -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + CALC_MIN_HALF1 : begin + diff = (f_Y_sum[15:12] == '0) ? f_Y_sum[11:0] : 12'hfff; + + // find min value in f_diff (a faster way) + case( find_min_in_10_values( + { f_over[-1][-1], f_diff[-1][-1] }, + { f_over[-1][ 0], f_diff[-1][ 0] }, + { f_over[-1][ 1], f_diff[-1][ 1] }, + { f_over[ 0][-1], f_diff[ 0][-1] }, + { f_over[ 0][ 0], f_diff[ 0][ 0] }, + { f_over[ 0][ 1], f_diff[ 0][ 1] }, + { f_over[ 1][-1], f_diff[ 1][-1] }, + { f_over[ 1][ 0], f_diff[ 1][ 0] }, + { f_over[ 1][ 1], f_diff[ 1][ 1] }, + { 1'b0, diff } ) ) + 4'd0 : begin f_mvyh <= -2'sd1; f_mvxh <= -2'sd1; f_inter <= 1'b1; end + 4'd1 : begin f_mvyh <= -2'sd1; f_mvxh <= 2'sd0; f_inter <= 1'b1; end + 4'd2 : begin f_mvyh <= -2'sd1; f_mvxh <= 2'sd1; f_inter <= 1'b1; end + 4'd3 : begin f_mvyh <= 2'sd0; f_mvxh <= -2'sd1; f_inter <= 1'b1; end + 4'd4 : begin f_mvyh <= 2'sd0; f_mvxh <= 2'sd0; f_inter <= 1'b1; end + 4'd5 : begin f_mvyh <= 2'sd0; f_mvxh <= 2'sd1; f_inter <= 1'b1; end + 4'd6 : begin f_mvyh <= 2'sd1; f_mvxh <= -2'sd1; f_inter <= 1'b1; end + 4'd7 : begin f_mvyh <= 2'sd1; f_mvxh <= 2'sd0; f_inter <= 1'b1; end + 4'd8 : begin f_mvyh <= 2'sd1; f_mvxh <= 2'sd1; f_inter <= 1'b1; end + default : begin f_mvyh <= 2'sd0; f_mvxh <= 2'sd0; f_inter <= 1'b0; end + endcase + end + + // state: 1 cycle -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + CALC_MIN_HALF2 : begin + if( f_i_frame == '0 ) begin // I-frame + f_inter <= 1'b0; + f_mvyh <= '0; + f_mvxh <= '0; + f_mvy <= '0; + f_mvx <= '0; + end else begin // P-frame + f_mvy <= (f_mvy << 1) + f_mvyh; + f_mvx <= (f_mvx << 1) + f_mvxh; + end + + for (int y=-1; y<16; y++) + for(int x=-1; x<16; x++) begin + if(-2= 2'sd0 || f_cnt == 4'd1 && f_mvyh >= 2'sd1 ) begin // up shift Y-half (f_Y_hlf) + for (int y=-1; y<31; y++) + for(int x=-1; x<32; x++) + f_Y_hlf[y][x] <= f_Y_hlf[y+1][x]; + end + + if ( f_mvy > 5'sd0 && (5)'(f_cnt) < $unsigned( f_mvy>>>2 ) ) // up shift U/V + for (int y=1 ; y<8+UR; y++) // needn't to shift the pixels of y<0, since they are discarded + for (int x=-UR; x<8+UR; x++) begin + f_U_prd[y-1][x] <= f_U_prd[y][x] ; + f_V_prd[y-1][x] <= f_V_prd[y][x] ; + end + else if ( f_mvy < 5'sd0 && (5)'(f_cnt) < $unsigned(-(f_mvy>>>2)) ) // down shift V/V + for (int y=-UR; y<8 ; y++) // needn't to shift the pixels of y>8 , since they are discarded + for (int x=-UR; x<8+UR; x++) begin + f_U_prd[y+1][x] <= f_U_prd[y][x] ; + f_V_prd[y+1][x] <= f_V_prd[y][x] ; + end + end + + // state: 3 cycle -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + REF_UV_SHIFT_X : begin + if( f_cnt == 4'd0 && f_mvxh >= 2'sd0 || f_cnt == 4'd1 && f_mvxh >= 2'sd1 ) begin // left shift Y-half (f_Y_hlf) + for (int y=-1; y<30; y++) // needn't to shift y>=30, since they are discarded + for(int x=-1; x<31; x++) + f_Y_hlf[y][x] <= f_Y_hlf[y][x+1]; + end + + if ( f_mvx > 5'sd0 && (5)'(f_cnt) < $unsigned( f_mvx>>>2 ) ) // left shift U/V + for (int y=0; y<=8 ; y++) // needn't to shift the pixels of y<0 and y>8, since they are discarded + for (int x=1; x<8+UR; x++) begin // needn't to shift the pixels of x<0, since they are discarded + f_U_prd[y][x-1] <= f_U_prd[y][x] ; + f_V_prd[y][x-1] <= f_V_prd[y][x] ; + end + else if ( f_mvx < 5'sd0 && (5)'(f_cnt) < $unsigned(-(f_mvx>>>2)) ) // right shift U/V + for (int y=0; y<=8 ; y++) // needn't to shift the pixels of y<0 and y>8, since they are discarded + for (int x=-UR; x<8; x++) begin // needn't to shift the pixels of x>8, since they are discarded + f_U_prd[y][x+1] <= f_U_prd[y][x] ; + f_V_prd[y][x+1] <= f_V_prd[y][x] ; + end + end + + // state: 1 cycle -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + PREDICT : begin + for (int y=0; y<16; y++) + for (int x=0; x<16; x++) + if ( ~f_inter ) + f_Y_prd[y][x] <= 8'h80; + else + f_Y_prd[y][x] <= f_Y_hlf[2*y-1][2*x-1]; + + for (int y=0; y<8; y++) + for (int x=0; x<8; x++) + if( ~f_inter ) begin + f_U_prd[y][x] <= 8'h80; + f_V_prd[y][x] <= 8'h80; + end else if ( ((f_mvy>>>1) & 1) & ((f_mvx>>>1) & 1) ) begin + f_U_prd[y][x] <= mean4( f_U_prd[y][x], f_U_prd[y][x+1], f_U_prd[y+1][x], f_U_prd[y+1][x+1] ) ; + f_V_prd[y][x] <= mean4( f_V_prd[y][x], f_V_prd[y][x+1], f_V_prd[y+1][x], f_V_prd[y+1][x+1] ) ; + end else if ( (f_mvx>>>1) & 1 ) begin + f_U_prd[y][x] <= mean2( f_U_prd[y][x], f_U_prd[y][x+1] ) ; + f_V_prd[y][x] <= mean2( f_V_prd[y][x], f_V_prd[y][x+1] ) ; + end else if ( (f_mvy>>>1) & 1 ) begin + f_U_prd[y][x] <= mean2( f_U_prd[y][x], f_U_prd[y+1][x] ) ; + f_V_prd[y][x] <= mean2( f_V_prd[y][x], f_V_prd[y+1][x] ) ; + end else begin + f_U_prd[y][x] <= f_U_prd[y][x]; + f_V_prd[y][x] <= f_V_prd[y][x]; + end + end + endcase + + + + + +////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +// stage G : DCT, including phase 1 (right multiply DCT_MATRIX_transposed) and phase 2 (left multiply DCT_MATRIX), then quantize. +////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +reg [ 5:0] g_cnt; + +reg [ 7:0] g_i_frame; +reg [ XB16-1:0] g_x16; +reg [ YB16-1:0] g_y16; +reg g_inter; +reg signed [ 4:0] g_mvx , g_mvy ; + +reg [ 7:0] g_tiles_prd [48][8]; // predicted tiles of current block : Y00, Y01, Y10, Y11, U, V +reg signed [ 8:0] g_tiles [48][8]; // residual tiles of current block : Y00, Y01, Y10, Y11, U, V + +reg signed [ 18+DCTP:0] g_dct_res1 [8][8]; // 21 bits = 9+10+3-1 +reg signed [ 18+DCTP:0] g_dct_res2 [8][8]; // 21 bits +reg signed [ 16:0] g_dct_res3 [8][8]; // 17 bits = 21+10+3-1-16 +reg signed [ 11:0] g_quant [8][8]; // 12 bits + +reg g_en_tile ; +reg [ 2:0] g_num_tile ; + +logic signed [ 18+DCTP:0] g_t1; // temporary variable not real register +logic signed [28+2*DCTP:0] g_t2; // temporary variable not real register +logic [ 15:0] g_t3; // temporary variable not real register + + +always @ (posedge clk or negedge rstn) + if (~rstn) begin + g_cnt <= '0; + g_en_tile <= 1'b0; + g_num_tile <= '0; + end else begin + if( f_en_blk ) + g_cnt <= 6'd1; + else if (g_cnt != '0) + g_cnt <= g_cnt + 6'd1; + + g_en_tile <= 1'b0; + if( g_cnt == 6'd18 || g_cnt == 6'd26 || g_cnt == 6'd34 || g_cnt == 6'd42 || g_cnt == 6'd50 || g_cnt == 6'd58 ) begin + g_en_tile <= 1'b1; + g_num_tile <= g_cnt[5:3] - 3'd2; // 0->Y00 1->Y01 2->Y10 3->Y11 4->U 5->V + end + end + + +always @ (posedge clk) + if( f_en_blk ) begin + g_i_frame <= f_i_frame; + g_y16 <= f_y16; + g_x16 <= f_x16; + g_inter <= f_inter; + g_mvx <= f_mvx; + g_mvy <= f_mvy; + + for (int y=0; y<8 ; y++) + for (int x=0; x<8 ; x++) begin + g_tiles_prd[y ][x ] <= f_Y_prd[y][x]; + g_tiles [y ][x ] <= $signed( (9)'(f_Y_blk[y][x]) ) - $signed( (9)'(f_Y_prd[y][x]) ); + end + + for (int y=0; y<8 ; y++) + for (int x=8; x<16; x++) begin + g_tiles_prd[y+8 ][x-8] <= f_Y_prd[y][x]; + g_tiles [y+8 ][x-8] <= $signed( (9)'(f_Y_blk[y][x]) ) - $signed( (9)'(f_Y_prd[y][x]) ); + end + + for (int y=8; y<16; y++) + for (int x=0; x<8 ; x++) begin + g_tiles_prd[y+8 ][x ] <= f_Y_prd[y][x]; + g_tiles [y+8 ][x ] <= $signed( (9)'(f_Y_blk[y][x]) ) - $signed( (9)'(f_Y_prd[y][x]) ); + end + + for (int y=8; y<16; y++) + for (int x=8; x<16; x++) begin + g_tiles_prd[y+16][x-8] <= f_Y_prd[y][x]; + g_tiles [y+16][x-8] <= $signed( (9)'(f_Y_blk[y][x]) ) - $signed( (9)'(f_Y_prd[y][x]) ); + end + + for (int y=0; y<8; y++) + for (int x=0; x<8; x++) begin + g_tiles_prd[y+32][x ] <= f_U_prd[y][x]; + g_tiles [y+32][x ] <= $signed( (9)'(f_U_blk[y][x]) ) - $signed( (9)'(f_U_prd[y][x]) ); + end + + for (int y=0; y<8; y++) + for (int x=0; x<8; x++) begin + g_tiles_prd[y+40][x ] <= f_V_prd[y][x]; + g_tiles [y+40][x ] <= $signed( (9)'(f_V_blk[y][x]) ) - $signed( (9)'(f_V_prd[y][x]) ); + end + + end else begin + for (int x=0; x<8 ; x++) begin + for (int y=0; y<47; y++) + g_tiles[y][x] <= g_tiles[y+1][x]; // up shift g_tiles + g_tiles[47][x] <= '0; + end + end + + +always @ (posedge clk) begin + // DCT phase 1 : right multiply DCT_MATRIX_transposed + // calculate when g_cnt = 1~8, 9~16, 17~24, 25~32, 33~40, 41~48 + // produce result when g_cnt = 9, 17, 25, 33, 41, 49 + for (int j=0; j<8; j++) begin + g_t1 = '0; + for (int k=0; k<8; k++) + g_t1 += g_tiles[0][k] * DCT_MATRIX[j][k]; // Note that DCT_MATRIX [j][k] == DCT_MATRIX_transposed [k][j] + g_dct_res1[7][j] <= g_t1; // push the DCT phase 1 result to the last row of g_dct_res1 + for (int i=0; i<7; i++) + g_dct_res1[i][j] <= g_dct_res1[i+1][j]; // up shift g_dct_res1 + end + + // save the 8x8 result of DCT phase 1 + if( g_cnt == 6'd9 || g_cnt == 6'd17 || g_cnt == 6'd25 || g_cnt == 6'd33 || g_cnt == 6'd41 || g_cnt == 6'd49 ) begin + for (int i=0; i<8; i++) + for (int j=0; j<8; j++) + g_dct_res2[i][j] <= g_dct_res1[i][j] ; // save the 8x8 result of DCT phase 1 to g_dct_res2 + end else begin + for (int i=0; i<8; i++) begin + for (int j=0; j<7; j++) + g_dct_res2[i][j] <= g_dct_res2[i][j+1]; // left shift g_dct_res2 + g_dct_res2[i][7] <= '0; + end + end + + // DCT phase 2 : left multiply DCT_MATRIX + // calculate when g_cnt = 10~17, 18~25, 26~33, 34~41, 42~49, 50~57 + // produce result when g_cnt = 18, 26, 34, 42, 50, 58 + for (int i=0; i<8; i++) begin + g_t2 = '0; + for(int k=0; k<8; k++) + g_t2 += DCT_MATRIX[i][k] * g_dct_res2[k][0]; + g_t2 = (g_t2>>>(12+2*DCTP)) + g_t2[11+2*DCTP]; // + g_dct_res3[i][7] <= $signed((17)'(g_t2)); // push the DCT phase 2 result to the last column of g_dct_res3. = (g_t2 + 32768) / 65536 + for(int j=0; j<7; j++) + g_dct_res3[i][j] <= g_dct_res3[i][j+1]; // left shift g_dct_res3 + end + + // save the 8x8 result of DCT phase 2, and do quantize by-the-way + if( g_cnt == 6'd18 || g_cnt == 6'd26 || g_cnt == 6'd34 || g_cnt == 6'd42 || g_cnt == 6'd50 || g_cnt == 6'd58 ) + for (int i=0; i<8; i++) + for (int j=0; j<8; j++) begin + g_t3 = (16)'( (g_dct_res3[i][j] < 0) ? -g_dct_res3[i][j] : g_dct_res3[i][j] ); // y = abs(x) + if( g_inter ) // inter block + g_t3 = (g_t3 + 16'd2) >> (4 + Q_LEVEL); // y = (y+2) / 16 / (1<>3) ) >> Q_LEVEL ) / INTRA_Q[i][j]; // y = ( y + (INTRA_Q*((3<>3) ) / (1<>4) + (16)'( g_t3[3] ) ; // y = (y/8 + 1) / 2 + if( g_t3 > 16'd2047 ) g_t3 = 16'd2047; // clip(y, 0, 2047) + g_quant[i][j] <= (g_dct_res3[i][j] < 0) ? -$signed((12)'(g_t3)) : $signed((12)'(g_t3)); // x = (y<0) ? -x : x; + end +end + + + + + +////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +// stage H & J : inverse quantize, inverse DCT phase 1 +////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +reg [ 2:0] h_num_tile; +reg h_en ; +reg [ 2:0] h_cnt ; +reg signed [ 12:0] h_iquant [8][8]; // 13 bit + +logic signed [ 16:0] h_t1; // not real register + +reg j1_en ; +reg [ 2:0] j1_num_tile ; +reg j1_en_tile ; +reg [ 32*9-1:0] j1_idct_x0_to_x8; + +always @ (posedge clk or negedge rstn) + if (~rstn) begin + h_en <= 1'b0; + h_cnt <= '0; + h_num_tile <= '0; + j1_en <= '0; + j1_num_tile <= '0; + j1_en_tile <= 1'b0; + end else begin + j1_en_tile <= 1'b0; + if (g_en_tile) begin + h_en <= 1'b1; + h_cnt <= '0; + h_num_tile <= g_num_tile; + end else begin + h_cnt <= h_cnt + 3'd1; + if(h_cnt == '1) + h_en <= 1'b0; + end + j1_en <= h_en; + if(h_en) begin + if(h_cnt == '1) begin + j1_en_tile <= 1'b1; + j1_num_tile <= h_num_tile; + end + end + end + +always @ (posedge clk) + if(g_en_tile) begin + for (int i=0; i<8; i++) begin + for (int j=0; j<8; j++) begin + h_t1 = g_quant[i][j]; // inverse quantize + if( g_inter ) begin // inter block + h_t1 <<= 1; // x *= 2 + h_t1 += (h_t1<0) ? -17'sd1 : (h_t1>0) ? 17'sd1 : 17'sd0 ; // x += sign(x) + h_t1 <<= (17)'(Q_LEVEL); // x *= (1< 17'sd2047) ? 17'sd2047 : h_t1; // clip(x, -2047, 2047) + end else if( i!=0 || j!=0 ) begin // intra block, AC value + h_t1 *= INTRA_Q[i][j]; // x *= INTRA_Q + if( Q_LEVEL >= 3 ) // x = x * (1<>>= (17)'(3 - Q_LEVEL); // + h_t1 = (h_t1 < -17'sd2047) ? -17'sd2047 : (h_t1 > 17'sd2047) ? 17'sd2047 : h_t1; // clip(x, -2047, 2047) + end else begin // intra block, DC value + h_t1 <<= 1; // x *= 2 + end + h_iquant[i][j] <= (13)'(h_t1); + end + end + end else begin + for (int j=0; j<8; j++) begin + for (int i=0; i<7; i++) + h_iquant[i][j] <= h_iquant[i+1][j]; // up shift h_iquant by 1 step + h_iquant[7][j] <= '0; + end + end + +always @ (posedge clk) + if(h_en) // inverse DCT + j1_idct_x0_to_x8 <= invserse_dct_rows_step12(h_iquant[0][0], h_iquant[0][1], h_iquant[0][2], h_iquant[0][3], h_iquant[0][4], h_iquant[0][5], h_iquant[0][6], h_iquant[0][7]); + + + + +// divide invserse_dct_rows to 2 pipeline stages : for better timing ----------------------------------------------------------------------------------------- + +reg [ 2:0] j_num_tile; +reg j_en_tile ; +reg signed [ 17:0] j_idct_res1 [8][8]; + +always @ (posedge clk or negedge rstn) + if (~rstn) begin + j_num_tile <= '0; + j_en_tile <= '0; + end else begin + if (j1_en) begin + j_num_tile <= j1_num_tile; + j_en_tile <= j1_en_tile; + end + end + +always @ (posedge clk) + if (j1_en) begin + {j_idct_res1[7][0], j_idct_res1[7][1], j_idct_res1[7][2], j_idct_res1[7][3], j_idct_res1[7][4], j_idct_res1[7][5], j_idct_res1[7][6], j_idct_res1[7][7]} <= invserse_dct_rows_step34(j1_idct_x0_to_x8); + for (int i=0; i<7; i++) + for (int j=0; j<8; j++) + j_idct_res1[i][j] <= j_idct_res1[i+1][j]; // up shift j_idct_res1 by 1 step + end + + + + + + +////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +// stage K & M : inverse DCT phase 2 +////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +reg [ 2:0] k_num_tile; +reg k_en ; +reg [ 2:0] k_cnt; +reg signed [ 17:0] k_idct_res2 [8][8]; + +reg m1_en; +reg m1_idct_en3; +reg [ 2:0] m1_num_tile; +reg [ 32*9-1:0] m1_idct_x0_to_x8; + +always @ (posedge clk or negedge rstn) + if (~rstn) begin + k_num_tile <= '0; + k_en <= '0; + k_cnt <= '0; + m1_en <= '0; + m1_idct_en3 <= '0; + m1_num_tile <= '0; + end else begin + m1_idct_en3 <= '0; + if( j_en_tile ) begin + k_num_tile <= j_num_tile; + k_en <= 1'b1; + k_cnt <= '0; + end else begin + k_cnt <= k_cnt + 3'd1; + if(k_cnt == '1) + k_en <= 1'b0; + end + m1_en <= k_en; + if(k_en) begin + if(k_cnt == '1) begin + m1_idct_en3 <= 1'b1; + m1_num_tile <= k_num_tile; + end + end + end + +always @ (posedge clk) begin // for inverse DCT stage 2 + if( j_en_tile ) begin + for (int i=0; i<8; i++) + for (int j=0; j<8; j++) + k_idct_res2[i][j] <= j_idct_res1[i][j]; + end else begin + for (int i=0; i<8; i++) begin + for (int j=0; j<7; j++) + k_idct_res2[i][j] <= k_idct_res2[i][j+1]; // left shift k_idct_res2 for 2 steps + k_idct_res2[i][7] <= '0; + end + end + if(k_en) + m1_idct_x0_to_x8 <= invserse_dct_cols_step12(k_idct_res2[0][0], k_idct_res2[1][0], k_idct_res2[2][0], k_idct_res2[3][0], k_idct_res2[4][0], k_idct_res2[5][0], k_idct_res2[6][0], k_idct_res2[7][0]); +end + + + +// divide invserse_dct_cols to 2 pipeline stages : for better timing ----------------------------------------------------------------------------------------- + +reg signed [ 8:0] m_idct_res3 [8][8]; +reg m_idct_en3; +reg [ 2:0] m_num_tile; + +always @ (posedge clk or negedge rstn) + if (~rstn) begin + m_idct_en3 <= '0; + m_num_tile <= '0; + end else begin + if (m1_en) begin + m_idct_en3 <= m1_idct_en3; + m_num_tile <= m1_num_tile; + end + end + +always @ (posedge clk) + if (m1_en) begin + {m_idct_res3[0][7], m_idct_res3[1][7], m_idct_res3[2][7], m_idct_res3[3][7], m_idct_res3[4][7], m_idct_res3[5][7], m_idct_res3[6][7], m_idct_res3[7][7]} <= invserse_dct_cols_step34(m1_idct_x0_to_x8); + for (int i=0; i<8; i++) + for (int j=0; j<7; j++) + m_idct_res3[i][j] <= m_idct_res3[i][j+1]; // left shift m_idct_res3 by 1 step + end + + + + + + +////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +// stage N & P : +////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +reg [ XB16-1:0] n_x16 ; +reg [ YB16-1:0] n_y16 ; +reg [ 5:0] n_num_tiles_line ; + +reg [ 7:0] n_tiles_prd [48][8]; // predicted block : Y/U/V tiles + +reg signed [ 8:0] n_idct_res4 [8][8]; +reg n_en ; +reg [ 2:0] n_cnt ; + +reg [ 8*8-1:0] p_delay_mem_wdata; +reg p_en ; +reg [ XB16-1:0] p_x16 ; +reg [ YB16-1:0] p_y16 ; +reg [ 5:0] p_num_tiles_line ; // 0~47 + +always @ (posedge clk or negedge rstn) + if (~rstn) begin + n_en <= '0; + n_cnt <= '0; + p_en <= '0; + end else begin + if(m_idct_en3) begin + n_en <= 1'b1; + n_cnt <= '0; + end else begin + n_cnt <= n_cnt + 3'd1; + if(n_cnt == '1) + n_en <= 1'b0; + end + p_en <= n_en; + end + +always @ (posedge clk) begin + if(m_idct_en3) begin + for (int y=0; y<8; y++) + for (int x=0; x<8; x++) + n_idct_res4[y][x] <= m_idct_res3[y][x]; + end else begin + for (int x=0; x<8; x++) begin + for (int y=0; y<7; y++) + n_idct_res4[y][x] <= n_idct_res4[y+1][x]; // up shift n_idct_res4 + n_idct_res4[7][x] <= '0; + end + end + + if(m_idct_en3 && m_num_tile == '0) begin // for the first tile in a block, save the predicted block + for (int y=0; y<48; y++) + for (int x=0; x<8; x++) + n_tiles_prd[y][x] <= g_tiles_prd[y][x]; // save the predicted block + n_x16 <= g_x16; + n_y16 <= g_y16; + n_num_tiles_line <= '0; + end else if(n_en) begin + for (int y=0; y<47; y++) + for (int x=0; x<8; x++) + n_tiles_prd[y][x] <= n_tiles_prd[y+1][x]; // up shift n_tiles_prd + n_num_tiles_line <= n_num_tiles_line + 6'd1; + end + + if(n_en) begin + for (int x=0; x<8; x++) + p_delay_mem_wdata[8*x+:8] <= add_clip_0_255( n_tiles_prd[0][x] , n_idct_res4[0][x] ) ; + p_x16 <= n_x16; + p_y16 <= n_y16; + p_num_tiles_line <= n_num_tiles_line; + end +end + + + + + + +////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +// stage Q & R : use memory (mem_delay) to delay for a slice, and then write back to reference memory +////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +reg [ 8*8-1:0] mem_delay [ 48 * (XSIZE/16) ]; // a memory to save a slice, to delay the write of mem_ref_Y & mem_ref_UV for a slice + +always @ (posedge clk) + if (p_en) + mem_delay[{p_num_tiles_line, p_x16}] <= p_delay_mem_wdata; + +reg [ 8*8-1:0] q_rd; +reg q_en ; +reg [ XB16-1:0] q_x16 ; +reg [ YB16-1:0] q_y16 ; +reg [ 5:0] q_num_tiles_line ; + +reg [ 8*8-1:0] r_rd; +reg r_en ; +reg [ XB16-1:0] r_x16 ; +reg [ YB16-1:0] r_y16 ; +reg [ 5:0] r_num_tiles_line ; + +always @ (posedge clk) + q_rd <= mem_delay[{p_num_tiles_line, p_x16}]; + +always @ (posedge clk) + r_rd <= q_rd; + +always @ (posedge clk or negedge rstn) + if (~rstn) begin + q_en <= '0; + q_x16 <= '0; + q_y16 <= '0; + q_num_tiles_line <= '0; + end else begin + q_en <= p_en; + q_x16 <= p_x16; + q_y16 <= p_y16; + q_num_tiles_line <= p_num_tiles_line; + end + +always @ (posedge clk or negedge rstn) + if (~rstn) begin + r_en <= '0; + r_x16 <= '0; + r_y16 <= '0; + r_num_tiles_line <= '0; + end else begin + r_en <= q_en; + r_x16 <= q_x16; + r_y16 <= (q_y16 == '0) ? max_y16 : q_y16 - (YB16)'(1) ; // set the write block to the upper slice + r_num_tiles_line <= q_num_tiles_line; + end + +always @ (posedge clk) + if( r_en && ~r_num_tiles_line[5] ) + mem_ref_Y [ {r_y16, r_num_tiles_line[4], r_num_tiles_line[2:0], r_x16, r_num_tiles_line[3]} ] <= r_rd; // write to Y reference frame memory + +always @ (posedge clk) + if( r_en && r_num_tiles_line[5] ) + mem_ref_UV[ {r_y16 , r_num_tiles_line[2:0], r_x16, r_num_tiles_line[3]} ] <= r_rd; // write to U/V reference frame memory + + +//reg [8*8-1:0] mem_ref_Y [ (YSIZE ) * (XSIZE/8 ) ]; // Y reference frame memory : (YSIZE ) rows, XSIZE/8 cols , each item contains 8 Y pixels +//reg [8*8-1:0] mem_ref_UV [ (YSIZE/2) * (XSIZE/16) * 2 ]; // U/V reference frame memory : (YSIZE/2) rows, XSIZE/16 cols, 2 channels (U/V), each item contains 8 U or V pixels + + + + + +////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +// stage S : zig-zag reorder, generate nzflags +////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +logic s_nzflag ; // temporary variable +reg [ 5:0] s_nzflags ; +reg signed [ 11:0] s_zig_blk [6] [64]; // 12 bit +reg s_en_blk ; + +always @ (posedge clk or negedge rstn) + if (~rstn) begin + s_en_blk <= 1'b0; + end else begin + s_en_blk <= 1'b0; + if(g_en_tile) + s_en_blk <= ( g_num_tile == 3'd5 ); // is the last tile in a block ? + end + +always @ (posedge clk) + if(g_en_tile) begin + for (int i=0; i<64; i++) begin + s_zig_blk[0][i] <= s_zig_blk[1][i]; + s_zig_blk[1][i] <= s_zig_blk[2][i]; + s_zig_blk[2][i] <= s_zig_blk[3][i]; + s_zig_blk[3][i] <= s_zig_blk[4][i]; + s_zig_blk[4][i] <= s_zig_blk[5][i]; + end + s_nzflag = ~g_inter; + for (int i=0; i<8; i++) + for (int j=0; j<8; j++) begin + s_zig_blk[5][ZIG_ZAG_TABLE[i][j]] <= g_quant[i][j]; // zig-zag reorder + s_nzflag |= (g_quant[i][j] != '0); // check if g_quant are all zero + end + s_nzflags <= (s_nzflags<<1) | (6)'(s_nzflag); + end + + + + + + + +////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +// stage T : MPEG2 stream generation +////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +reg [ 5:0] t_frame_hour, t_frame_minute, t_frame_second, t_frame_insec; // (hour,minute,second,insec) = (0~63,0~59,0~59,0~23) + +reg [ 7:0] t_i_frame ; +reg [ XB16-1:0] t_x16 ; +reg [ YB16-1:0] t_y16 ; +reg t_inter ; +reg [ 5:0] t_nzflags ; +reg signed [ 4:0] t_mvx, t_mvy; +reg signed [ 4:0] t_prev_mvx, t_prev_mvy; +reg signed [ 11:0] t_zig_blk [6] [64]; +reg signed [ 11:0] t_prev_Y_dc, t_prev_U_dc, t_prev_V_dc; +reg [ 5:0] t_runlen ; + +reg [ 2:0] t_num_tile ; +reg [ 3:0] t_cnt ; +enum reg [ 2:0] {PUT_ENDED, PUT_SEQ_HEADER2, PUT_IDLE, PUT_FRAME_HEADER, PUT_SLICE_HEADER, PUT_BLOCK_INFO, PUT_TILE} t_stat; + +reg t_end_seq ; +reg t_align ; +reg [ 23:0] t_bits [7]; +reg [ 4:0] t_lens [7]; +reg t_append_b10 ; + + +logic signed [ 6:0] dmv; // temporary variable, not real register +logic [ 4:0] dmvabs; // temporary variable, not real register +logic nzflag; // temporary variable, not real register +logic signed [ 11:0] val; // temporary variable, not real register +logic signed [ 12:0] diff_dc; // temporary variable, not real register +logic [ 11:0] tmp_val; // temporary variable, not real register +logic [ 3:0] vallen; // temporary variable, not real register +logic [ 5:0] runlen; // temporary variable, not real register + + +function automatic logic [24+5-1:0] put_AC (input logic signed [11:0] v, input logic [5:0] rl); // because of run-length encoding, v cannot be zero + logic [23:0] bits; + logic [ 4:0] lens; + logic [10:0] absv; + absv = (v < 12'sd0) ? (11)'($unsigned(-v)) : (11)'($unsigned(v)); + absv --; + if ( rl == 0 && absv < 40 || rl == 1 && absv < 18 || rl == 2 && absv < 5 || rl == 3 && absv < 4 ) begin + bits = { BITS_AC_0_3[rl][absv], (1)'(v<12'sd0)}; + lens = LENS_AC_0_3[rl][absv] + 5'd1; + end else if( rl <= 6 && absv < 3 || rl <= 16 && absv < 2 || rl <= 31 && absv < 1 ) begin + bits = {BITS_AC_4_31[rl][absv], (1)'(v<12'sd0)}; + lens = LENS_AC_4_31[rl][absv] + 5'd1; + end else begin + bits = { 6'h1, rl, (12)'($unsigned(v)) }; + lens = 5'd24; + end + return {bits, lens}; +endfunction + + +always @ (posedge clk or negedge rstn) + if (~rstn) begin + {t_frame_hour, t_frame_minute, t_frame_second, t_frame_insec} <= '0; + t_i_frame <= '0; + t_x16 <= '0; + t_y16 <= '0; + t_inter <= '0; + t_nzflags <= '0; + t_mvx <= '0; + t_mvy <= '0; + t_prev_mvx <= '0; + t_prev_mvy <= '0; + {t_prev_Y_dc, t_prev_U_dc, t_prev_V_dc} <= '0; + t_runlen <= '0; + t_num_tile <= '0; + t_cnt <= '0; + t_stat <= PUT_ENDED; + + t_end_seq <= '0; + t_align <= '0; + for(int i=0; i<7; i++) begin + t_bits[i] <= '0; + t_lens[i] <= '0; + end + t_append_b10 <= '0; + end else begin + t_runlen <= '0; + + t_num_tile <= '0; + t_cnt <= '0; + + t_end_seq <= '0; + t_align <= '0; + for(int i=0; i<7; i++) begin + t_bits[i] <= '0; + t_lens[i] <= '0; + end + t_append_b10 <= '0; + + case(t_stat) + PUT_ENDED : begin + if(sequence_start) begin + t_stat <= PUT_SEQ_HEADER2; + + {t_frame_hour, t_frame_minute, t_frame_second, t_frame_insec} <= '0; // clear time code + + t_align <= 1'b1; + t_bits <= '{ 'h000001, 'hB3, {size_x, size_y}, 'h1209c4, 'h200000, 'h0001B5, 'h144200 }; // sequence header : part 1 (152 bits) + t_lens <= '{ 24, 8, 24, 24, 24, 24, 24 }; + end + end + + PUT_SEQ_HEADER2 : begin + t_stat <= PUT_IDLE; + + t_bits <= '{ 'h010000, 'h000001, 'hB52305, 'h0505, size_x, 1'b1, size_y }; // sequence header : part 2 (117 bits) + t_lens <= '{ 24, 24, 24, 16, 14, 1, 14 }; + end + + PUT_IDLE : begin + if( t_y16 == max_y16 && t_x16 == max_x16 && sequence_state == SEQ_ENDED) begin + t_stat <= PUT_ENDED; + t_end_seq <= 1'b1; + t_align <= 1'b1; + t_bits[0] <= 'h000001; + t_lens[0] <= 24; + t_bits[1] <= 'hB7; // sequence end + t_lens[1] <= 8; + + end else if( s_en_blk ) begin + t_i_frame <= g_i_frame; + t_y16 <= g_y16; + t_x16 <= g_x16; + t_inter <= g_inter; + t_mvx <= g_mvx; + t_mvy <= g_mvy; + t_nzflags <= s_nzflags; + + t_stat <= PUT_BLOCK_INFO; + + if( g_x16 == '0 ) begin // start of slice + t_stat <= PUT_SLICE_HEADER; + if( g_y16 == '0 ) begin // start of frame + t_stat <= PUT_FRAME_HEADER; + if( g_i_frame == '0 ) begin // start of GOP + t_align <= 1'b1; + t_bits <= '{ 'h000001, 'hB8, t_frame_hour, t_frame_minute, {1'b1, t_frame_second}, t_frame_insec, 'h2 }; // GOP header (59 bits) + t_lens <= '{ 24, 8, 6, 6, 7, 6, 2 }; + end + end + end + end + end + + PUT_FRAME_HEADER : begin + t_stat <= PUT_SLICE_HEADER; + + t_align <= 1'b1; + t_bits <= '{ 'h000001, t_i_frame, 'h10000, 'h0, 'h000001, 'hB58111, 'h1BC000 }; // frame header (136 bits for I-frame, 144 bits for P-frame) + t_lens <= '{ 24, 18, 19, 3, 24, 24, 24 }; + + if ( t_i_frame != '0 ) begin // for P-frame + t_bits[2] <= 'h20000; + t_bits[3] <= 'h380; + t_lens[3] <= 11; + end + + // for new frame, update time code --------------------------- + t_frame_insec <= t_frame_insec + 6'd1; + if( t_frame_insec == 6'd23 ) begin + t_frame_insec <= '0; + t_frame_second <= t_frame_second + 6'd1; + if( t_frame_second == 6'd59 ) begin + t_frame_second <= '0; + t_frame_minute <= t_frame_minute + 6'd1; + if( t_frame_minute == 6'd59 ) begin + t_frame_minute <= '0; + if( t_frame_hour < 6'd63 ) + t_frame_hour <= t_frame_hour + 6'd1; + end + end + end + end + + PUT_SLICE_HEADER : begin + t_stat <= PUT_BLOCK_INFO; + + t_align <= 1'b1; + t_bits <= '{ 'h000001, 1+t_y16, (2< 7'sd15) + dmv -= 7'sd32; + else if (dmv < -7'sd16) + dmv += 7'sd32; + dmvabs = (dmv < 7'sd0) ? (5)'($unsigned(-dmv)) : (5)'($unsigned(dmv)) ; + t_bits[1] <= BITS_MOTION_VECTOR[dmvabs]; + t_lens[1] <= LENS_MOTION_VECTOR[dmvabs]; + if (dmv != 7'sd0) begin + t_bits[2] <= (1)'(dmv < 7'sd0); + t_lens[2] <= 1; + end + + // put motion vector y ------------------------------------------------------------------ + dmv = t_mvy; + dmv -= t_prev_mvy; + if (dmv > 7'sd15) + dmv -= 7'sd32; + else if (dmv < -7'sd16) + dmv += 7'sd32; + dmvabs = (dmv < 7'sd0) ? (5)'($unsigned(-dmv)) : (5)'($unsigned(dmv)) ; + t_bits[3] <= BITS_MOTION_VECTOR[dmvabs]; + t_lens[3] <= LENS_MOTION_VECTOR[dmvabs]; + if (dmv != 7'sd0) begin + t_bits[4] <= (1)'(dmv < 7'sd0); + t_lens[4] <= 1; + end + + // put nzflags ------------------------------------------------------------------ + t_bits[5] <= BITS_NZ_FLAGS[t_nzflags]; + t_lens[5] <= LENS_NZ_FLAGS[t_nzflags]; + + t_prev_mvx <= t_mvx; + t_prev_mvy <= t_mvy; + end else begin // for intra block, clear the previous motion vector + t_prev_mvx <= '0; + t_prev_mvy <= '0; + end + end + + PUT_TILE : begin + + nzflag = t_nzflags[5]; + + if (t_cnt == 4'd0) begin // DC value + val = t_zig_blk[0][0]; // val <- DC value + diff_dc = val; + if (t_num_tile < 3'd4) begin + diff_dc -= t_prev_Y_dc; + t_prev_Y_dc <= t_inter ? '0 : val; // save the DC value as the previous Y DC value for next tile + end else if (t_num_tile == 3'd4) begin + diff_dc -= t_prev_U_dc; + t_prev_U_dc <= t_inter ? '0 : val; // save the DC value as the previous U DC value for next tile + end else begin + diff_dc -= t_prev_V_dc; + t_prev_V_dc <= t_inter ? '0 : val; // save the DC value as the previous V DC value for next tile + end + + if (t_inter) begin // put DC value (INTER) + if (val == '0) begin + t_runlen <= 6'd1; + end else if( val == 12'sd1 || val == -12'sd1 ) begin + if (nzflag) begin + t_bits[0] <= { 1'b1, (1)'(val<12'sd0) }; + t_lens[0] <= 2; + end + end else begin + if (nzflag) + {t_bits[0], t_lens[0]} <= put_AC(val, 6'd0); + end + end else begin // put DC value (INTRA) + tmp_val = (12)'($unsigned( (diff_dc < 13'sd0) ? -diff_dc : diff_dc )); + vallen = '0; + for (int i=0; i<12; i++) + if (tmp_val[i]) + vallen = (4)'(i+1); + tmp_val = (12)'($unsigned(diff_dc)); + if (diff_dc < 13'sd0) + tmp_val += ((12'd1 << vallen) - 12'd1); + if (nzflag) begin + t_bits[0] <= (t_num_tile < 3'd4) ? BITS_DC_Y[vallen] : BITS_DC_UV[vallen]; + t_lens[0] <= (t_num_tile < 3'd4) ? LENS_DC_Y[vallen] : LENS_DC_UV[vallen]; + t_bits[1] <= tmp_val; + t_lens[1] <= vallen; + end + end + end else begin // AC value + runlen = t_runlen; + for(int i=0; i<7; i++) begin + val = t_zig_blk[0][i+1]; + if (val != 12'sd0) begin + if (nzflag) + {t_bits[i], t_lens[i]} <= put_AC(val, runlen); + runlen = 6'd0; + end else + runlen ++; + end + t_runlen <= runlen; + t_append_b10 <= nzflag && (t_cnt == 4'd9); // for the last cycle of a tile, append 2'b10 to the MPEG2 stream + end + + if (t_cnt < 4'd9) begin // NOT the last cycle of a tile + t_cnt <= t_cnt + 4'd1; + t_num_tile <= t_num_tile; + end else begin // the last cycle of a tile + t_num_tile <= t_num_tile + 3'd1; + if (t_num_tile == 3'd5) // the last tile + t_stat <= PUT_IDLE; // end of this block, return to IDLE + t_nzflags <= (t_nzflags << 1); + end + end + endcase + end + + +always @ (posedge clk) + case(t_stat) + PUT_IDLE : begin + if( s_en_blk ) begin + for(int i=0; i<6; i++) + for(int j=0; j<64; j++) + t_zig_blk[i][j] <= s_zig_blk[i][j]; + end + end + + PUT_TILE : begin + if (t_cnt == 4'd0) begin // DC value + end else if (t_cnt < 4'd9) begin // NOT the last cycle of a tile + for (int i=1; i<=56; i++) + t_zig_blk[0][i] <= t_zig_blk[0][i+7]; // shift AC values for 7 steps + end else begin // the last cycle of a tile + for(int i=0; i<5; i++) // switch the tiles + for(int j=0; j<64; j++) + t_zig_blk[i][j] <= t_zig_blk[i+1][j]; + end + end + endcase + + + + + +reg [ 169:0] u_bits ; // max 170 bits +reg [ 7:0] u_lens ; // 0~170 +reg u_align ; +reg u_end_seq1 ; +reg u_end_seq2 ; + +logic [ 169:0] ut_bits; // temporary variable, not real register +logic [ 7:0] ut_lens; // temporary variable, not real register + +always @ (posedge clk or negedge rstn) + if(~rstn) begin + u_bits <= '0; + u_lens <= '0; + u_align <= '0; + {u_end_seq2, u_end_seq1} <= '0; + end else begin + ut_bits = '0; + ut_lens = '0; + if (t_append_b10) begin + ut_bits = 170'b10; + ut_lens += 8'd2; + end + for (int i=6; i>=0; i--) begin + ut_bits |= ( (170)'(t_bits[i]) << ut_lens ); + ut_lens += t_lens[i]; + end + u_bits <= ut_bits; + u_lens <= ut_lens; + u_align <= t_align; + {u_end_seq2, u_end_seq1} <= {u_end_seq1, t_end_seq}; + end + + + + +reg [ 254:0] v_bits ; // max 255 bits +reg [ 7:0] v_lens ; // 0~255 + +reg [ 255:0] v_data ; +reg v_en ; +reg v_last ; + +logic [ 431:0] vt_bits; // 432 bits, temporary variable, not real register +logic [ 8:0] vt_lens; // temporary variable, not real register + +always @ (posedge clk or negedge rstn) + if(~rstn) begin + v_bits <= '0; + v_lens <= '0; + v_data <= '0; + v_en <= '0; + v_last <= '0; + end else begin + if (u_end_seq2) begin // a special case: end of sequence + v_bits <= '0; + v_lens <= '0; + v_data <= {v_bits, 1'b0}; + v_en <= 1'b1; + v_last <= 1'b1; + end else begin + vt_lens = (9)'(v_lens); + if (u_align && vt_lens[2:0] != 3'h0) begin + vt_lens[2:0] = '0; + vt_lens[8:3] ++; // align lens to a multiple of 8 bits (1 byte) + end + vt_lens += (9)'(u_lens); + vt_bits = {v_bits, 177'h0} | ( (432)'(u_bits) << (9'd432-vt_lens) ); + v_lens <= vt_lens[7:0]; + if (vt_lens[8]) begin + {v_data, v_bits} <= {vt_bits, 79'h0}; + v_en <= 1'b1; + end else begin + v_bits <= vt_bits[431:177]; + v_en <= 1'b0; + end + v_last <= 1'b0; + end + end + + + + +assign o_en = v_en; +assign o_last = v_last; +assign o_data = v_data; + + +endmodule + + + + diff --git a/xdma/ch_3/xdma_mpeg2encode.c b/xdma/ch_3/xdma_mpeg2encode.c new file mode 100644 index 0000000..acc690e --- /dev/null +++ b/xdma/ch_3/xdma_mpeg2encode.c @@ -0,0 +1,372 @@ + +#include +#include +#include +#include +#include + +#include + + + + +// function : dev_read +// description : read data from device to local memory (buffer), (i.e. device-to-host) +// parameter : +// dev_fd : device instance +// addr : source address in the device +// buffer : buffer base pointer +// size : data size +// return: +// int : 0=success, -1=failed +int dev_read (int dev_fd, uint64_t addr, void *buffer, uint64_t size) { + if ( addr != lseek(dev_fd, addr, SEEK_SET) ) // seek + return -1; // seek failed + if ( size != read(dev_fd, buffer, size) ) // read device to buffer + return -1; // read failed + return 0; +} + + +// function : dev_write +// description : write data from local memory (buffer) to device, (i.e. host-to-device) +// parameter : +// dev_fd : device instance +// addr : target address in the device +// buffer : buffer base pointer +// size : data size +// return: +// int : 0=success, -1=failed +int dev_write (int dev_fd, uint64_t addr, void *buffer, uint64_t size) { + if ( addr != lseek(dev_fd, addr, SEEK_SET) ) // seek + return -1; // seek failed + if ( size != write(dev_fd, buffer, size) ) // write device from buffer + return -1; // write failed + return 0; +} + + +// function : dev_read_uint64 +// description : read a uint64 value (8 bytes) from device to local, (i.e. device-to-host) +// parameter : +// dev_fd : device instance +// addr : source address in the device +// p_value : uint64 value's pointer +// return: +// int : 0=success, -1=failed +int dev_read_uint64 (int dev_fd, uint64_t addr, uint64_t *p_value) { + return dev_read(dev_fd, addr, (void*)p_value, sizeof(uint64_t)); +} + + +// function : dev_write_uint64 +// description : write a uint64 value (8 bytes) from to device, (i.e. host-to-device) +// parameter : +// dev_fd : device instance +// addr : target address in the device +// value : uint64 value to write +// return: +// int : 0=success, -1=failed +int dev_write_uint64 (int dev_fd, uint64_t addr, uint64_t value) { + return dev_write(dev_fd, addr, (void*)&value, sizeof(uint64_t)); +} + + + +// mpeg2encoder device's address map -------------------------------------------------------------------------------- +#define ADDR_RESET_AND_SEQUENCE_CONTROL 0x00000000UL +#define ADDR_VIDEO_FRAME_SIZE 0x00000008UL +#define ADDR_OUT_BUF_CONTROL 0x00000010UL +#define ADDR_BASE_IN_PIXELS 0x01000000UL +#define ADDR_BASE_OUT_BUF 0x01000000UL + + +// function : mpeg2encoder_reset +// description : reset the mpeg2encoder in the device +// return: +// int : 0=success, -1=failed +int mpeg2encoder_reset (int dev_w_fd) { + if ( dev_write_uint64(dev_w_fd, ADDR_RESET_AND_SEQUENCE_CONTROL, 0x0) ) // reset + return -1; + return dev_write_uint64(dev_w_fd, ADDR_RESET_AND_SEQUENCE_CONTROL, 0x1) ; // reset release +} + + +// function : mpeg2encoder_set_sequence_stop +// description : send a "sequence stop" signal to the device, which indicate this is the end of a video sequence +// return: +// int : 0=success, -1=failed +int mpeg2encoder_set_sequence_stop (int dev_w_fd) { + return dev_write_uint64(dev_w_fd, ADDR_RESET_AND_SEQUENCE_CONTROL, 0x3) ; // sequence stop, note that the lowest bit is set to 1 to keep the reset released +} + + +// function : mpeg2encoder_set_video_frame_size +// description : set the frame width and height of the video sequence to be encoded +// parameter : +// xsize16 : frame width / 16 . e.g. for width = 640, xsize16 = 40 +// ysize16 : frame height / 16 . e.g. for height = 480, ysize16 = 30 +// return: +// int : 0=success, -1=failed +int mpeg2encoder_set_video_frame_size (int dev_w_fd, int xsize16, int ysize16) { + uint64_t reg_value; + reg_value = ysize16; + reg_value <<= 32; + reg_value |= xsize16; + return dev_write_uint64(dev_w_fd, ADDR_VIDEO_FRAME_SIZE, reg_value); +} + + +// function : mpeg2encoder_put_pixels +// description : put raw YUV pixels to the device, which will be encoded to the MPEG2 stream +// return: +// int : 0=success, -1=failed +int mpeg2encoder_put_pixels (int dev_w_fd, void *buffer, uint64_t size) { + return dev_write(dev_w_fd, ADDR_BASE_IN_PIXELS, buffer, size); +} + + +// function : mpeg2encoder_get_outbuf +// description : get data (encoded MPEG2 stream) from the out buffer of the device +// return: +// int : 0=success, -1=failed +int mpeg2encoder_get_outbuf (int dev_r_fd, int dev_w_fd, void *buffer, int *p_size, int *p_overflow) { // buffer must have 0x200000 (2MB) space at least + int offset = 0; + + for (;;) { + do { // wait until outbuf not empty + uint64_t reg_value; + if ( dev_read_uint64(dev_r_fd, ADDR_OUT_BUF_CONTROL, ®_value) ) + return -1; + *p_size = reg_value & 0x7ffffff; + *p_overflow = ((reg_value >> 32) & 1); + } while (*p_size <= 0) ; + + if (offset >= *p_size) // no more available data in out buffer + break; + + if ( dev_read(dev_r_fd, ADDR_BASE_OUT_BUF+offset, (void*)((char*)buffer+offset), *p_size-offset ) ) + return -1; + + offset = *p_size; + } + + return dev_write_uint64(dev_w_fd, ADDR_OUT_BUF_CONTROL, 0UL); // clear the out buffer +} + + + + +// function : get_millisecond +// description : get time in millisecond +uint64_t get_millisecond () { + struct timeb tb; + ftime(&tb); + return (uint64_t)tb.millitm + (uint64_t)tb.time * 1000UL; + // tb.time is the number of seconds since 00:00:00 January 1, 1970 UTC time; + // tb.millitm is the number of milliseconds in a second +} + + + +// function : parse_int +// description : get a int value from string (support hexadecimal and decimal) +int parse_int (char *string, int *pvalue) { + if ( string[0] == '0' && string[1] == 'x' ) // HEX format "0xXXXXXXXX" + return sscanf( &(string[2]), "%x", pvalue); + else // DEC format + return sscanf( string , "%d", pvalue); +} + + + +char USAGE [] = + "Usage: \n" + " %s \n" + "\n" + "Where:\n" + " in_file_name : .yuv file which contains the raw YUV pixels of a video,\n" + " must be in YUYV format (a type of YUV 4:2:2 packed format)\n" + " out_file_name : .m2v file which contains the MPEG2 stream,\n" + "\n" + "Example:\n" + " %s /dev/xdma0_c2h_0 /dev/xdma0_h2c_0 in.yuv out.m2v 640 480\n" + "\n" ; + + + +// video frame size limitations (limited by the device) ----------------------------------------------------------------- +#define VIDEO_FRAME_XSIZE16_MIN 4 // min frame width = 4 * 16 = 64 +#define VIDEO_FRAME_XSIZE16_MAX 64 // max frame width = 4 * 64 = 1024 +#define VIDEO_FRAME_YSIZE16_MIN 4 // min frame height = 4 * 16 = 64 +#define VIDEO_FRAME_YSIZE16_MAX 64 // max frame height = 4 * 64 = 1024 + + + +#define IN_BUF_SIZE 0x200000 +#define OUT_BUF_SIZE 0x200000 + + + +int main (int argc, char *argv[]) { + + static char in_buffer [IN_BUF_SIZE]; // this buffer is for the RAW pixels which is to send to the device + static char out_buffer [OUT_BUF_SIZE]; // this buffer is for the MPEG2 stream get from the device + + int ret = -1; + uint64_t millisecond; + char usage_string [1024]; + + char *dev_r_name, *dev_w_name, *in_file_name, *out_file_name; + int xsize, ysize; + + int dev_r_fd = -1, dev_w_fd = -1; + + FILE *in_fp = NULL, *out_fp = NULL; + + + sprintf(usage_string, USAGE, argv[0], argv[0] ); + + if (argc < 7) { // not enough argument + puts(usage_string); + return -1; + } + + + dev_r_name = argv[1]; + dev_w_name = argv[2]; + in_file_name = argv[3]; + out_file_name = argv[4]; + + if ( 0 == parse_int(argv[5], &xsize) ) { // parse video width (xsize) from command line argument + puts(usage_string); + return -1; + } + + if ( 0 == parse_int(argv[6], &ysize) ) { // parse video height (ysize) from command line argument + puts(usage_string); + return -1; + } + + // print information ----------------------------------- + printf("device : %s %s\n" , dev_r_name, dev_w_name); + printf("input yuv file : %s\n" , in_file_name); + printf("output m2v file : %s\n" , out_file_name); + printf("video size : %d x %d\n" , xsize, ysize ); + + // check video frame size ----------------------------------- + if ( xsize % 16 != 0 ) { + printf("*** ERROR: xsize must be multiple of 16\n"); + return -1; + } + if ( ysize % 16 != 0 ) { + printf("*** ERROR: ysize must be multiple of 16\n"); + return -1; + } + if ( xsize < 16*VIDEO_FRAME_XSIZE16_MIN || xsize > 16*VIDEO_FRAME_XSIZE16_MAX ) { + printf("*** ERROR: xsize must not smaller than %d or larger than %d", 16*VIDEO_FRAME_XSIZE16_MIN, 16*VIDEO_FRAME_XSIZE16_MAX); + return -1; + } + if ( ysize < 16*VIDEO_FRAME_YSIZE16_MIN || ysize > 16*VIDEO_FRAME_YSIZE16_MAX ) { + printf("*** ERROR: ysize must not smaller than %d or larger than %d", 16*VIDEO_FRAME_YSIZE16_MIN, 16*VIDEO_FRAME_YSIZE16_MAX); + return -1; + } + + + dev_r_fd = open(dev_r_name, O_RDWR); // open device + if (dev_r_fd < 0) { + printf("*** ERROR: failed to open device %s\n", dev_r_name); + goto close_and_clear; + } + + dev_w_fd = open(dev_w_name, O_RDWR); // open device + if (dev_w_fd < 0) { + printf("*** ERROR: failed to open device %s\n", dev_w_name); + goto close_and_clear; + } + + in_fp = fopen(in_file_name, "rb"); // open file for read + if (in_fp == NULL) { + printf("*** ERROR: failed to open file %s for read\n", in_file_name); + goto close_and_clear; + } + + out_fp = fopen(out_file_name, "wb"); // open file for write + if (out_fp == NULL) { + printf("*** ERROR: failed to open file %s for write\n", out_file_name); + goto close_and_clear; + } + + + millisecond = get_millisecond(); // get start time + + if ( mpeg2encoder_reset(dev_w_fd) ) { // reset the mpeg2encoder device + printf("*** ERROR: failed to reset device\n"); + goto close_and_clear; + } + + if ( mpeg2encoder_set_video_frame_size(dev_w_fd, xsize/16, ysize/16) ) { // set video frame size + printf("*** ERROR: failed to set video frame size\n"); + goto close_and_clear; + } + + do { + int in_len, out_len, out_overflow; + + in_len = fread((void*)in_buffer, 1, IN_BUF_SIZE, in_fp); // file -> in_buffer (raw YUV pixels) + + if ( mpeg2encoder_put_pixels(dev_w_fd, (void*)in_buffer, in_len) ) { // in_buffer -> device (raw YUV pixels) + printf("*** ERROR: failed to put pixels\n"); + goto close_and_clear; + } + + if ( feof(in_fp) ) // if input file is ended, send "video sequence stop" signal + if ( mpeg2encoder_set_sequence_stop(dev_w_fd) ) { + printf("*** ERROR: failed to set sequence stop\n"); + goto close_and_clear; + } + + if ( mpeg2encoder_get_outbuf (dev_r_fd, dev_w_fd, (void*)out_buffer, &out_len, &out_overflow) ) { // device -> out_buffer (MPEG2 stream) + printf("*** ERROR: failed to get data from device\n"); + goto close_and_clear; + } + + if ( out_overflow ) + printf("*** WARNING: device's out buffer overflow, out data stream is partly corrupted\n"); + + if ( out_len != fwrite((void*)out_buffer, 1, out_len, out_fp) ) { // out_buffer -> file (MPEG2 stream) + printf("*** ERROR: failed to write %s\n", out_file_name); + goto close_and_clear; + } + + } while ( ! feof(in_fp) ) ; + + millisecond = get_millisecond() - millisecond; // get time consumption + millisecond = (millisecond > 0) ? millisecond : 1; // avoid divide-by-zero + + printf("input size = %ld\n", ftell(in_fp) ); + printf("output size = %ld\n", ftell(out_fp) ); + printf("compression ratio = %.1lf\n", (double) ftell(in_fp) / (ftell(out_fp) + 1) ); + printf("time=%lu ms input throughput = %.1lf KBps\n", millisecond, (double) ftell(in_fp) / millisecond ); + + ret = 0; + +close_and_clear: + + if (dev_r_fd >= 0) + close(dev_r_fd); + + if (dev_w_fd >= 0) + close(dev_r_fd); + + if (in_fp) + fclose(in_fp); + + if (out_fp) + fclose(out_fp); + + return ret; +} + + +