diff --git a/Makefile b/Makefile
index d8e206b..5c76806 100644
--- a/Makefile
+++ b/Makefile
@@ -47,6 +47,39 @@ python_lint:
auto_gen:
python3 test/auto_gen/run_tests.py
+# Build Nexys A7 Examples
+NEXYS_A7_EXAMPLES := io_core_ether io_core_uart ps2_logic_analyzer video_sprite_ether video_sprite_uart
+
+.PHONY: nexys_a7 $(NEXYS_A7_EXAMPLES)
+nexys_a7: $(NEXYS_A7_EXAMPLES)
+
+# Steps are chained with && (not ;) so a failed cd aborts before `rm -rf obj` or Vivado can run in the wrong directory
+$(NEXYS_A7_EXAMPLES):
+	cd examples/nexys_a7/$@ && \
+	python3 -m manta gen manta.yaml src/manta.v && \
+	rm -rf obj && mkdir -p obj && \
+	$(VIVADO) -mode batch -source ../build.tcl
+
+# Build Icestick Examples
+ICESTICK_EXAMPLES := io_core
+
+.PHONY: icestick $(ICESTICK_EXAMPLES)
+icestick: $(ICESTICK_EXAMPLES)
+
+# Steps are chained with && so the build stops (and make reports failure) at the first failing tool; intermediates are removed only after a successful build
+$(ICESTICK_EXAMPLES):
+	cd examples/icestick/$@ && \
+	python3 -m manta gen manta.yaml manta.v && \
+	$(YOSYS) -p 'synth_ice40 -top top_level -json top_level.json' top_level.sv && \
+	$(NEXTPNR_ICE40) --hx1k --json top_level.json --pcf top_level.pcf --asc top_level.asc && \
+	$(ICEPACK) top_level.asc top_level.bin && \
+	rm -f *.json *.asc
+
+# Formal Verification (runs sby on the uart_rx and bridge_rx proofs)
+.PHONY: formal
+formal:
+	sby -f test/formal_verification/uart_rx.sby
+	sby -f test/formal_verification/bridge_rx.sby
# Functional Simulation
sim: ethernet_tx_tb ethernet_rx_tb mac_tb block_memory_tb io_core_tb logic_analyzer_tb bridge_rx_tb bridge_tx_tb block_memory_tb
@@ -104,35 +137,3 @@ uart_tx_tb:
vvp sim.out
rm sim.out
-# Formal Verification
-formal:
- sby -f test/formal_verification/uart_rx.sby
- sby -f test/formal_verification/bridge_rx.sby
-
-# Build Nexys A7 Examples
-NEXYS_A7_EXAMPLES := io_core_ether io_core_uart ps2_logic_analyzer video_sprite_ether video_sprite_uart
-
-.PHONY: nexys_a7 $(NEXYS_A7_EXAMPLES)
-nexys_a7: $(NEXYS_A7_EXAMPLES)
-
-$(NEXYS_A7_EXAMPLES):
- cd examples/nexys_a7/$@; \
- python3 -m manta gen manta.yaml src/manta.v; \
- rm -rf obj; \
- mkdir -p obj; \
- $(VIVADO) -mode batch -source ../build.tcl
-
-# Build Icestick Examples
-ICESTICK_EXAMPLES := io_core
-
-.PHONY: icestick $(ICESTICK_EXAMPLES)
-icestick: $(ICESTICK_EXAMPLES)
-
-$(ICESTICK_EXAMPLES):
- cd examples/icestick/$@; \
- python3 -m manta gen manta.yaml manta.v; \
- $(YOSYS) -p 'synth_ice40 -top top_level -json top_level.json' top_level.sv; \
- $(NEXTPNR_ICE40) --hx1k --json top_level.json --pcf top_level.pcf --asc top_level.asc; \
- $(ICEPACK) top_level.asc top_level.bin; \
- rm -f *.json; \
- rm -f *.asc;
diff --git a/doc/assets/bus_architecture.png b/doc/assets/bus_architecture.png
new file mode 100644
index 0000000..4badb26
Binary files /dev/null and b/doc/assets/bus_architecture.png differ
diff --git a/doc/assets/io_core_block_diagram.png b/doc/assets/io_core_block_diagram.png
new file mode 100644
index 0000000..bc446f5
Binary files /dev/null and b/doc/assets/io_core_block_diagram.png differ
diff --git a/doc/assets/manta_architecture.png b/doc/assets/manta_architecture.png
new file mode 100644
index 0000000..d388fe2
Binary files /dev/null and b/doc/assets/manta_architecture.png differ
diff --git a/doc/assets/read_transaction.png b/doc/assets/read_transaction.png
new file mode 100644
index 0000000..113302e
Binary files /dev/null and b/doc/assets/read_transaction.png differ
diff --git a/doc/assets/uart.png b/doc/assets/uart.png
new file mode 100644
index 0000000..5b0f418
Binary files /dev/null and b/doc/assets/uart.png differ
diff --git a/doc/assets/write_transaction.png b/doc/assets/write_transaction.png
new file mode 100644
index 0000000..5bdd470
Binary files /dev/null and b/doc/assets/write_transaction.png differ
diff --git a/doc/block_memory_core.md b/doc/block_memory_core.md
index bef296d..f27e37f 100644
--- a/doc/block_memory_core.md
+++ b/doc/block_memory_core.md
@@ -110,4 +110,19 @@ In a situation like this, you might want to pause writes to your BRAM while you
### Example 2 - Neural Network Accelerator
This problem would also arise if you were making a NN accelerator, with 32-bit weights stored in a BRAM updated by the host machine. Each entry would need two write operations, and during the time between the first and second write, the entry would contain a MSB from one weight, and a LSB from another. This may not be desirable - depending on what you do with your inference results, running the network with the invalid weight might be problematic.
-If you can pause inference, then the flag-based solution with an IO core described in the prior example could work. However if you cannot pause inference, you could use a second BRAM as a cache. Run inference off one BRAM, and write new weights into another. Once all the weights have been written, assert a flag with an IO Core, and switch the BRAM that weights are obtained from. This guaruntees that the BRAM contents are always valid.
\ No newline at end of file
+If you can pause inference, then the flag-based solution with an IO core described in the prior example could work. However, if you cannot pause inference, you could use a second BRAM as a cache. Run inference off one BRAM, and write new weights into another. Once all the weights have been written, assert a flag with an IO Core, and switch the BRAM that weights are obtained from. This guarantees that the BRAM contents are always valid.
+
+\section{Block Memory Core}
+\subsection{Description}
+Block memory, also referred to as block RAM (BRAM), is a staple of FPGA designs. It consists of dedicated blocks of memory spaced throughout the FPGA die, and is very commonly used in hardware designs due to its configurability, simplicity, and bandwidth. Although each block memory primitive is made of fixed-function silicon, EDA tools allow them to be mapped to logical memories of arbitrary width and depth, combining and masking off primitives when necessary. These are exposed to the user’s logic over \textit{ports}, which contain four signals for reading and writing to the BRAM. These signals specify the address, input data, output data, and the desired operation (read/write) to the core. Most BRAM primitives include two ports, each of which may live on a separate clock domain, making them useful for clock domain crossing in addition to data storage. Each port can handle a memory operation on every clock edge, which is practically the maximum memory bandwidth possible in any digital system.
+
+Central to Manta’s design objectives is the ability to debug user logic in an intuitive and familiar manner. Practically, this means being able to interact with bits on the FPGA in whatever method they’re presented. Block memory is one such method, and their pervasive use is acknowledged by the inclusion of a Block Memory Core in Manta. This core takes a standard dual-port, dual-clock BRAM and connects one port to Manta’s internal bus, and gives the other port to the user. This means that both the host machine and the user’s logic have access to the BRAM, allowing large amounts of data to be shared between both devices.
+
+This is accomplished by architecting the Block Memory Core as shown in Figure \ref{fig_block_mem_core_arch}. Internally, the Block Memory Core consists of multiple BRAMs connected in parallel. This is done to maintain the ability to create block memory of arbitrary width and depth. Manta’s internal bus uses 16-bit data words, so if a user wishes to create a BRAM of width $N$ where $N$ is larger than 16 bits, then multiple addresses in Manta’s memory are required to contain the data at a single BRAM address. These multiple addresses are provided by instantiating many smaller block memories, each of which stores a 16-bit slice of the $N$-bit wide data. As a result, $\left\lceil \frac{N}{16} \right\rceil$ smaller BRAMs are needed to present a BRAM of width $N$ to the user. One set of ports on these smaller BRAMs is concatenated together, which presents an $N$-bit wide BRAM to the user. The other set of ports is individually connected to Manta’s internal bus.
+
+\begin{figure}[h!]
+\centering
+\includegraphics[width=\textwidth]{block_memory_architecture.png}
+\caption[Block diagram of the Block Memory Core.]{Block diagram of the Block Memory Core. Blocks in blue are clocked on the bus clock, and blocks in orange are clocked on the user clock.}
+\label{fig_block_mem_core_arch}
+\end{figure}
\ No newline at end of file
diff --git a/doc/ethernet.md b/doc/ethernet.md
index 72b327f..0ffcaa3 100644
--- a/doc/ethernet.md
+++ b/doc/ethernet.md
@@ -1,37 +1,18 @@
-ok so the way the new packets work is:
+\section{Ethernet Interface}
+\subsection{Description}
+For situations where the onboard UART is not available, Manta provides a 100Mbps Ethernet link for communicating between the host machine and target FPGA. This link implements an L2 MAC on the FPGA, designed to be directly connected to a host machine on a dedicated network adapter. The MAC is controlled by a bridge interface, which performs the exact same function as it does on the UART interface. Incoming packets are parsed into bus transactions, placed on the bus, and any response data is encapsulated into another packet sent to the host.
-- everything uses the same ethertype - that's configured once, in manta.yaml, and is set as a parameter in each of the rx and tx stacks
+This is done by interacting with an Ethernet PHY, an onboard transceiver IC that converts between the FPGA's logic-level signaling and the voltages on the cable's twisted pairs. The communication between the Ethernet PHY and the FPGA is done over an interface that's dependent on the speed of the PHY. The 10/100 Mbps interface used on the Nexys A7-100T uses the RMII as defined in IEEE 802.3u. RMII is the second-oldest member in the Media Independent Interface family, with newer revisions of 802.3 supporting faster interfaces.
-- we do [addr] [data] for incoming write messages, and [addr] for incoming read messages.
-- we do [data] for outgoing read responses. this means that:
- - we need to detect packet length on mac_rx
- - packets coming out of the FPGA are fixed-width, mac_tx will always send out 2 bytes of data
- - packets going into the FPGA are guarunteed to be longer than packets coming out of the FPGA
+Manta's bus clock must be equivalent to the PHY's reference clock if Ethernet is to be used - in the case of the 100Mbps RMII PHY on the Nexys A7 used in 6.205, this is 50MHz. This doesn't pose a problem for user logic, which is connected through Manta's cores that perform CDC internally. It does mean that a reference clock for the PHY has to be synthesized outside of Manta itself, and the means by which this is done varies by FPGA vendor and toolchain.
-- actually this doesn't make a lot of sense - we're going to be padding anyway, so this really just introduces extra complexity for us. let's just do
- something like [rw] [addr] [data]
- - since we know that we're _always_ going to get in at least 60 bytes of content and each message only contains like
- - we could say that in the future since we're using a fixed ethertype and can detect the paket length based on the crsdv line, we could concevably
- stack a bunch of [rw] [addr] [data] things together in the same packet - and creep right up to the ethernet MTU. but we'll file that along the 'other stuff'
- and go from there. for now let's just pull 1 + 2 + 2 = 5 bytes = 40 bits into aggregate and see what happens.
+This MAC allows for the usage of packets with the structure shown in Figure \ref{ethernet_packet_structure}. The bus transaction being communicated is placed at the beginning of the packet's payload field, which IEEE 802.3 allows to vary in length from 46 to 1500 bytes. The 46-byte lower limit requires 41 bytes of zero padding to be added to the five bytes used to specify a bus transaction, and only one bus transaction is specified in each Ethernet frame. This abundance of unused space results in all packets being the same length, whether the packet contains a read request, write request, or read response. Packets containing write requests elicit no response from the FPGA, just as write requests delivered over UART produce no response. The justification for this behavior is shared between the Ethernet and UART interfaces, and is provided in Section \ref{uart_justification}.
- - ok so then updated mac_rx is:
- - ether, with the reset removed from it
- - bitorder, with the reset removed from it
- - firewall, but checks the destination MAC of the packet in addition to the ethertype
- - transaction, which turns the packets coming in into rw/addr/data triplets. this is then outputted to the top level of mac_rx
+\begin{figure}[h]
+\centering
+\includegraphics[width=\textwidth]{ethernet_packet.png}
+\caption{Structure of the Ethernet packets exchanged between the host and FPGA.}
+\label{ethernet_packet_structure}
+\end{figure}
- - and the updated mac_tx is:
- - just the same, except we just put the busficiation logic inside it. so then instead of having start we do the logic with rw_i and valid_i ourselves,
- and buffer thee data ourselves
-
- - so then we just have mac_tx and mac_rx in the manta core chain. which feels good.
-
-
-previous ideas:
- - how to do variable length detection? right now our current stack is not well suited for that
- - keeping in line with the existing stack, we want to progressively take out chunks as time goes on.
- - i think we should modify firewall to check ethertype in addition to mac address also get rid of the reset while we're at it
- - because it's jaycode, probably going to be easier to rewrite from scratch to preserve style and sanity. i don't have anything to prove
- - we can use the 205 checkers for this, ironcially enough
- - i think we should modify aggregate to get both the payload and length. the payload is clocked in dibit-by-dibit, so we'll want to grab the
\ No newline at end of file
+These packets are addressed directly to the host's MAC address, which is obtained during code autogeneration. These packets also use a fixed Ethertype of \texttt{0x88B5}, which is specially reserved for ``public use and for prototype and vendor-specific protocol development'' in IEEE 802.1. This was done to create an Ethernet II frame instead of a legacy 802.3 frame, without having to implement a higher level protocol like TCP or UDP to safely use a fixed Ethertype. This allows the MAC to use modern Ethernet II frames safely while saving FPGA resources.
\ No newline at end of file
diff --git a/doc/how_it_works.md b/doc/how_it_works.md
new file mode 100644
index 0000000..a01630e
--- /dev/null
+++ b/doc/how_it_works.md
@@ -0,0 +1,111 @@
+
+## Overview
+
+To use Manta, you'll need a host machine with a FPGA development board connected to it over UART or Ethernet. The whole system looks like the following:
+
+
+
+Manta is operated via its Python API, which communicates with the connected FPGA over an interface API like `pySerial` or `Scapy`. These abstract away the OS device drivers, which function differently depending on the host machine's platform. The OS device drivers ultimately send out bytes to the FPGA, across either a USB or Ethernet cable.
+
+Once sent across the wire, bytes are picked up by an interface transceiver on the FPGA development board. This is either a USB-UART converter or an RMII PHY, depending on whether you're using UART or Ethernet. This chip is connected to the FPGA's IO, which routes the signals to the Verilog module generated by Manta. This module parses incoming messages, passes them down a set of daisy-chained cores, and then packetizes the result and sends it back to the host. These cores also connect to your logic, and are operated to help you debug your logic. The procedure for this is described below:
+
+## Usage
+Using Manta consists of the following steps:
+
+- Specifying a set of debug cores you wish to include in your design. This is done with a configuration file, formatted as either JSON or YAML.
+- Invoking Manta to generate Verilog from the configuration provided. This is done with `manta gen`, which produces a single file containing a definition for a Verilog module named `manta`.
+- Instantiating `manta` in your design, and connecting it to the logic you'd like to debug. You'll also need to connect it to your FPGA's serial transceiver if you're using UART, or its RMII PHY if you're using Ethernet. This permits communication with the host machine.
+- Building and uploading the design to your FPGA using your preferred toolchain.
+- Operating the debug core(s) through either the Python API, or the command line. The functions available to each core are described in their documentation.
+
+An example configuration file is provided below:
+
+```yaml
+---
+cores:
+ my_io_core:
+ type: io
+
+ inputs:
+ probe_0_in: 6
+ probe_1_in: 12
+
+ outputs:
+ probe_2_out: 20
+ probe_3_out: 1
+
+ my_logic_analyzer:
+ type: logic_analyzer
+ sample_depth: 4096
+ trigger_loc: 1000
+
+ probes:
+ larry: 1
+ curly: 3
+ moe: 9
+
+ triggers:
+ - moe RISING
+ - curly FALLING
+
+uart:
+ port: "auto"
+ baudrate: 3000000
+ clock_freq: 100000000
+```
+
+This will create a Manta instance with an IO Core and a Logic Analyzer, each containing a number of probes at variable widths. The Manta module itself is provided a 100MHz clock, and communicates with the host over UART running at 3Mbaud.
+
+## System Architecture
+The logic Manta places on the FPGA consists of a series of cores connected in a chain along a common bus. Each core provides a unique method for interacting with the user’s logic, which it connects to by routing signals, called `probes`, between the user’s logic and the cores that interface with it.
+
+
+
+These probes are presented as addressable memory, and are controlled by reading and writing to their corresponding memory - not unlike registers on a microcontroller. Each core is allotted a section of address space at compile time, and operations addressed to a core’s address space control the behavior of the core. These cores are then daisy-chained along an internal bus, which permits a chain of arbitrarily many cores to be placed on the bus.
+
+At the beginning of this chain is a module called a _receive bridge_, which converts incoming UART/Ethernet communication from the host into read and write requests, which are placed on the bus. These are called _bus transactions_, and once placed on the bus, they travel through each core before reaching the _transmit bridge_ at the end of the chain. This module places the result of the bus transaction back on the UART/Ethernet interface, and sends it back to the host. This produces a request-response style of communication between the host machine and the FPGA.
+
+
+## Data Bus
+
+The data bus is designed for simplicity, and consists of four signals used to perform reads and writes on memory:
+
+- `addr [15:0]`, indicating the memory address targeted by the current transaction.
+- `data [15:0]`, which data is read from during a read, or written to during a write.
+- `rw`, indicating a read or write transaction if the signal is low or high respectively.
+- `valid`, which is driven high only when the operation specified by the other signals is to be executed.
+
+Each core has a bus input and output port, so that cores can be daisy-chained together. When it receives an incoming bus transaction (signalled by `valid`), the core checks the address on the wire against its own memory space. If the address lies within the core, the core will perform the requested operation against its own memory space. In the case of a read, it places the data at that address on `data`, and in the case of a write, it copies the value of `data` to the specified location in memory. However, if the address lies outside of the memory of the core, then no operations are performed.
+
+In all cases, the transaction is passed from the input port to the output port, regardless of whether the address fell within the core's memory. Examples of a read and a write transaction are shown below:
+
+
+
+
+
+## Message Format
+
+Ethernet and UART both allow a stream of bytes to be sent between the host and FPGA, but since they're just interfaces, they don't define how these bytes are structured. As a result, Manta implements its own messaging format, with the following structure:
+
+
+
+
+
+Each of these messages is a string of ASCII characters consisting of a preamble, optional address and data fields, and an End of Line (EOL). The preamble denotes the type of operation, _R_ for a read and _W_ for a write. The address and data fields are encoded as hexadecimal digits, represented with the characters 0-9 and A-F in ASCII. As a result, four characters are needed to encode a 16-bit address or 16-bits of data. If the message specifies a write request, then it will contain a data field after the address field. Both request types will conclude with an End of Line, which consists of the two ASCII characters indicating a Carriage Return (CR) and a Line Feed (LF).
+
+These requests are sent by the host machine to the FPGA, which reads them from the `rx` line on the interface transceiver. This is handled by the receive bridge, which parses incoming messages, and generates bus transactions from them. Once this transaction runs through every core in the chain, it arrives at the transmit bridge, which may send a response back to the host over the `tx` line.
+
+If the request specified a read operation, then a response will be produced. These responses have the same structure as the read request itself, albeit with the data read from memory substituted in place of the address. This results in a message of the same length, just with the address swapped for data. If the request specified a write operation, then no response will be sent back to the host. Manta provides no report of if the write was successful - if a write operation must be verified, Manta will just perform a read operation on the same address location and check the value.
+
+An example of some bus traffic is provided below:
+
+| Sequence Number | Interface Activity | Operation |
+|-----------------|---------------------------------|-------------------------|
+| 1 | Host → FPGA: R1234(CR)(LF) | - |
+| 2 | FPGA → Host: R5678(CR)(LF) | Read 0x5678 from 0x1234 |
+| 3 | Host → FPGA: WF00DBEEF(CR)(LF) | Write 0xBEEF to 0xF00D |
+| 4 | Host → FPGA: RF00D(CR)(LF) | - |
+| 5 | FPGA → Host: RBEEF(CR)(LF) | Read 0xBEEF from 0xF00D |
+| 6 | Host → FPGA: W12340000(CR)(LF) | Write 0x0000 to 0x1234 |
+
+When UART is used, these bytes are transmitted directly across the wire, but when Ethernet is used, they're packed into the packet's payload field.
\ No newline at end of file
diff --git a/doc/installation.md b/doc/installation.md
index 505daf3..189552e 100644
--- a/doc/installation.md
+++ b/doc/installation.md
@@ -1,3 +1,14 @@
+## Dependencies
+Manta requires the following dependencies:
+
+- pyYAML, which is used for parsing configuration files written in YAML.
+- pySerial, used for communicating with the FPGA over UART.
+- Scapy, used for communicating with the FPGA over Ethernet.
+- pyVCD, used for writing waveforms captured by the Logic Analyzer Core to standard Value Change Dump (VCD) files.
+
+All of these dependencies are technically optional. If you're comfortable writing configuration files in JSON, then you don't need pyYAML. If you're using UART exclusively in your project, then you won't need Scapy. That said, Manta will try to install (or use an existing copy of) pyYAML, pySerial, and pyVCD during its own installation to cover all use cases.
+
+## Installation
You can install the latest version of Manta directly from source with:
```
diff --git a/doc/io_core.md b/doc/io_core.md
index d0646e5..4769a1f 100644
--- a/doc/io_core.md
+++ b/doc/io_core.md
@@ -1,20 +1,51 @@
-# IO Core
+
+## Overview
+Registers are a fundamental building block of digital hardware. Registers store values as they move throughout the FPGA, and are operated on by the logic placed onboard the chip. Interfacing with this logic in an intuitive manner is Manta’s primary design objective, and as a result it includes an Input/Output (IO) core to directly measure and control arbitrary signals on the FPGA. This is done by routing them to registers, which are then exposed to the host over Manta’s internal bus.
+
+
+
+This is done with the architecture shown in the block diagram above. A series of connections are made to the user’s logic. These are called _probes_, and each may be either an input or an output. If the probe is an input, then its value is taken from the user’s logic, and stored in a register that may be read by the host machine. If the probe is an output, then its value is provided to the user’s logic from a register written to by the host. The width of each probe is arbitrary, and is set by the user at compile time.
+
+However, the connection between these probes and the user’s logic is not direct. The state of each probe is buffered, and the buffers are updated when a _strobe_ register within the IO core is set by the host machine. During this update, new values for output probes are provided to user logic, and new values for input probes are read from user logic.
+
+This is done to mitigate the possibility of an inconsistent system state. Although users may configure registers of arbitrary width, Manta’s internal bus uses 16-bit data words, meaning operations on probes larger than 16 bits require multiple bus transactions. These transactions occur over some number of clock cycles, with an arbitrary amount of time between each.
+
+This can easily cause data corruption if the signals were unbuffered. For instance, a read operation on an input probe would read 16 bits at a time, but the probe’s value may change in the time that passes between transactions. This would cause the host to read a value for which each 16 bit chunk corresponds to a different moment in time. Taken together, these chunks may represent a value that the input probe never had. Similar corruption would occur when writing to an unbuffered output probe. The value of the output probe would take multiple intermediate values as each 16-bit section is written by the host. During this time the value of the output probe is not equal to either the incoming value from the host, or the value the host had previously written to it. The user logic connected to the output probe has no idea of this, and will dutifully use whatever value it is provided. This can very easily induce undesired behavior in the user’s logic, as it is being provided inputs that the user did not specify.
+
+Buffering the probes mitigates these issues, but slightly modifies the way the host machine uses the core. When the host wishes to read from an input probe, it will set and then clear the strobe register, which pulls the current value of the probe into the buffer. The host then reads from the buffer, which is guaranteed to not change as it is being read from. Writing to an output probe is done in much the same way. The host writes a new value to the buffer, which is flushed out to the user’s logic when the strobe register is set and cleared. This updates every bit in the output probe all at once, guaranteeing the user logic does not observe any intermediate values.
+
+These buffers also provide a convenient location to perform clock domain crossing. Each buffer is essentially a two flip-flop synchronizer, which allows the IO core to interact with user logic on a different clock than Manta’s internal bus.
+
+
+
+
+
+% \begin{figure}[h!]
+% \centering
+% \includegraphics[width=0.8\textwidth]{io_core_memory_map}
+% \caption{Memory map of an IO core.}
+% \label{io_core_memory_map}
+% \end{figure}
!!! warning "This isn't magic!"
- While the IO Core has been designed to be as fast as possible,
- setting and querying registers is nowhere near instantaneous!
+ While the IO Core has been designed to be as fast as possible,
+ setting and querying registers is nowhere near instantaneous!
If you're trying to set values in your design with cycle-accurate
timing, this will not do that for you.
-## Configuration
+## Options
+- `inputs`
+- `outputs`
+
+## Example Configuration
```yaml
---
the_muppets_io_core:
- type: io
-
+ type: io
+
inputs:
kermit: 3
piggy: 68
@@ -22,8 +53,8 @@ the_muppets_io_core:
scooter: 4
outputs:
- fozzy: 2
- gonzo: 3
+ fozzy: 1
+ gonzo: 3
uart:
baudrate: 115200
@@ -31,29 +62,18 @@ uart:
```
## Python API
-_More details to follow here as this gets written out, for now this is just a sketch_
-
-This emulates the look and feel of an IO pin, much like what you'd find on a microcontroller.
-
-Manta provides a Python API to control these - which allows for behavior like:
-
-```python
->>> import manta.api
->>> cores = manta.api.generate('manta.yaml')
->>> io = cores.my_io_core
->>> io.probe0.set(True)
->>> io.probe0.set(False)
->>> io.probe1.read()
-True
-```
The caveat being that Manta is limited by the bandwidth of PySerial, which is limited by your operating system and system hardware. These calls may take significant time to complete, and __they are blocking__. More details can be found in the API reference.
+### Example
-## How does it work?
+```python
+>>> from manta import Manta
+>>> m = Manta('manta.yaml')
+>>> m.the_muppets_io_core.fozzy.set(True)
+>>> m.the_muppets_io_core.gonzo.set(4)
+>>> m.the_muppets_io_core.scooter.get()
+5
+```
-Each probe set in the config file maps to a single address on the bus. However the address points to a data register that's only 16-bits wide, so for probe widths larger than that we map to multiple registers.
-
-This gets a little weird. Normally each register would update as soon as it receives a new value from the interface, but doing so here would cause parts of the probe to update at different times. This is usually not desireable, so instead a copy of the probe state is maintained inside the IO core. In the case of an output core, this copy gets modified whenever write operations are performed on that section of memory, but _only on a write operation to the last register_ does this buffer get clocked out to the rest of the FPGA. Similiarly for an input core, we maintain a local copy, but that gets clocked in __on a read operation from the first register__. This makes sure that the operation remains atomic with respect to the whole port, not just every 16 bits of it.
-
-This does mean that a write operation to the last register/a read operation from the first register will cause data to be clocked out of/into the core. This allows for some flexibility in how packets are sent to the device.
\ No newline at end of file
+## How It Works
diff --git a/doc/logic_analyzer_core.md b/doc/logic_analyzer_core.md
index 6c6c5a7..21499e6 100644
--- a/doc/logic_analyzer_core.md
+++ b/doc/logic_analyzer_core.md
@@ -101,4 +101,83 @@ manta playback manta.yaml my_logic_analyzer sim/playback.v
Generates a Verilog wrapper at `sim/playback.v`, which can then be instantiated in the testbench in which it is needed. An example instantiation is provided at the top of the output verilog, so a simple copy-paste into the testbench is all that's necessary to use the module. This module is also fully synthesizable, so you can use it in designs that live on the FPGA too, if so you so wish.
-## Examples
\ No newline at end of file
+## Examples
+
+
+## from thesis
+
+\section{Logic Analyzer Core}
+\subsection{Description}
+\label{logic_analyzer_core_description}
+Central to Manta's design is the ability to debug logic in a manner intuitive and familiar to 6.205 students. As such, Manta includes a logic analyzer tool that allows them to inspect their logic through a waveform display, similar to how it might be inspected through simulation. A typical workflow for using the core consists of the following:
+
+\begin{itemize}
+ \item The user describes the signals they would like to probe in the configuration file. The user provides a list of probe names and widths, which are needed to generate suitable Verilog.
+ \item The user describes the \textit{trigger conditions} that must be met inside the FPGA fabric for a capture to begin. Triggers are defined as simple logical operations on probes, for instance checking if a probe named \texttt{foo} is equal to the number $3$, or if a probe named \texttt{bar} has just transitioned from high to low. The user also specifies the number of samples to be captured, referred to as the \textit{sample depth} of the core.
+ \item Once fully configured, a Manta module is generated and flashed to the target FPGA with the process described in \ref{usage}.
+ \item Once flashed, the user initiates the ILA from the host machine. This causes the Logic Analyzer Core to start sampling its inputs, waiting for the trigger condition to be met.
+ \item Once met, the core begins saving the values of the probes to an internal block RAM called the \textit{sample memory}. This occurs every clock cycle until a number of samples equal to the sample depth has been captured, and the sample memory is full.
+ \item Once complete, the host machine reads out the sample memory and stores it internally. This is then exported as a VCD file for use in a waveform viewer like GTKWave.
+\end{itemize}
+
+\begin{figure}[h!]
+\centering
+\includegraphics[width=\textwidth]{gtkwave.png}
+\caption{A logic analyzer capture displayed in GTKWave.}
+\label{gtkwave}
+\end{figure}
+
+This workflow is very similar to the behavior of the Xilinx ILA or a benchtop logic analyzer. This is intentional. FPGA engineers are familiar with on-chip logic analyzers, and electrical engineers are familiar with external logic analyzers. Very little is intended to be different, although a few extra features deserve mention:
+
+\subsection{Features}
+\subsubsection{Trigger Modes}
+The behavior described in \ref{logic_analyzer_core_description} is referred to as single-shot trigger mode. This means that once the trigger condition is met, data is captured on every clock cycle in a continuous single shot. This is useful and the preferred behavior for most cases, but Manta also supports \textit{Incremental} and \textit{Immediate} trigger modes.
+
+In Incremental mode, samples are only recorded to sample memory \textit{when} the trigger condition is met, not \textit{once} it is met. This allows slower-moving behavior to be captured. For instance, digital audio signals on a FPGA commonly use a 44.1kHz sampling frequency, but are routed through FPGA fabric clocked at hundreds of megahertz. As a result, many thousands of clock cycles may go by before a new audio sample is processed by the FPGA - filling the sample memory of a traditional logic analyzer with redundant data in the meantime. Placing Manta's Logic Analyzer into incremental mode solves this, as audio samples will only be saved to the sample memory when they change, assuming the trigger is configured correctly. In this case, the amount of memory required on the FPGA to capture a fixed number of audio samples is reduced by a thousandfold.
+
+In Immediate mode, the trigger condition is ignored. The core begins filling the sample memory as soon as it is enabled, stopping only once the sample memory is filled. This allows the user to inspect the current state of their probes without a trigger condition. This is especially useful for investigating cases where a trigger condition is never being met, such as latchup or deadlock conditions. This mode is also useful for obtaining a random snapshot of the FPGA's state. The core is enabled by an interface (UART, Ethernet) that is slow relative to the clock speed of the FPGA fabric, meaning that the capture occurs at an effectively random time. Successive captures of this nature can be used to determine the ``average'' state of onboard logic - what information is ``usually'' on a bus, or what state a module is ``typically'' in.
+
+\subsubsection{Configurable Trigger Location}
+
+In the scenario described in \ref{logic_analyzer_core_description}, the sample memory is written to as soon as the trigger condition is met - and not before. This only records the probe values after the trigger, but knowing the state of the FPGA immediately before is also rather useful. To do this, the core can be configured to buffer the last few clock cycles before the trigger condition. During this time the sample memory is used as a FIFO, and once the trigger condition occurs, samples are acquired until the sample memory is filled. The number of cycles to record ahead of the trigger is called the \textit{trigger position}. By default, most logic analyzers place the trigger condition in the middle of the acquisition such that there are equal amounts of data from before and after the trigger condition. To feel as intuitive and familiar as possible, Manta defaults to the same. However, this can be changed by writing to a register in the logic analyzer core.
+
+\begin{figure}[h!]
+\centering
+\includegraphics[width=\textwidth]{trigger_positions.png}
+\caption{Regions captured by the Logic Analyzer Core as trigger position is varied.}
+\label{trigger_location_fig}
+\end{figure}
+
+\subsubsection{Simulator Playback}
+Manta also allows data captured from the Logic Analyzer core to be ``played back'' in simulation. Any obtained capture data can be exported as a \texttt{.mem} file, which can be used in most simulators via the \texttt{\$readmemh} and \texttt{\$readmemb} Verilog system tasks. Manta autogenerates a convenient Verilog wrapper for this, allowing users to simulate logic with signals directly measured from the real world. This is useful for verifying that a testbench is providing the proper inputs to logic under test. Playback is particularly helpful in a few scenarios:
+
+\begin{itemize}
+ \item \textit{Input Verification.} This targets the common student experience in 6.205 of designs working in simulation, but failing in hardware. In the absence of any build errors, this usually means that the inputs being applied to the logic in simulation don't accurately represent those being applied to the logic in the real world. \footnote{Sometimes the toolchain will step in and modify the logic specified by the user. For example, if a net is driven by two nets at the same time, Vivado will connect the net to ground, and raise a critical warning. In this case, a valid bitstream is still generated, but it doesn't configure the FPGA in a way that will match simulation.} Playing signals back in simulation allows for easy comparison between simulated and measured input, and the state of the logic downstream.
+
+ \item \textit{Sparse Sampling.} When users are debugging, their fundamental concern is the state of their logic. Normally this is obtained by sampling every net of interest with a logic analyzer probe, but for designs with a large amount of internal state sampling many signals requires significant block memory and lots of time to set up. If the design has fewer inputs than state variables, it requires fewer resources to sample the states and simulate the logic than to directly sample the state. For instance, debugging a misbehaving branch predictor in a CPU can be done by recording its address and data busses, playing them back in simulation, and inspecting the branch predictor there. This frees the user from having to sample the entire pattern history table, which would consume significant block memory.
+\end{itemize}
+
+\subsubsection{Reprogrammable Triggers}
+Manta's triggers are reprogrammable, such that rebuilding source code is not necessary to change the trigger condition. Each of the logic analyzer's input probes has a trigger assigned to it, which continuously evaluates some combinational function on the input. This logic can be programmed to check for rising edges, falling edges or any change at all. It can also be programmed to check the result of a logical operation (such as $>$, $\leq$, $=$, $\neq$, etc.) against an \textit{argument}. The operation and argument for each probe's trigger are set with a pair of registers in Manta's memory.
+
+The output of each of the individual triggers is then combined to trigger the logic analyzer core as a whole. These are combined with a $N$-input logic gate (either AND or OR) specified by the user through another register in memory. As a result the entire trigger configuration is specified by the state of Manta's memory, and changes to the configuration require resetting registers, not resynthesizing bitstreams.
+
+However, this greatly restricts the trigger conditions users can specify. To mitigate this, Manta provides an option for an external trigger that allows for more complex triggers. When enabled, Manta adds an input port to the \texttt{manta} Verilog module, and triggers off its value, rather than the internal comparators. This allows users to provide their own Verilog to produce the desired trigger condition.
+
+\subsection{Architecture}
+The Logic Analyzer Core's implementation on the FPGA consists of three primary components:
+
+\begin{itemize}
+ \item The \textit{Finite State Machine (FSM)}, which controls the operation of the core. The FSM's operation is driven by its associated registers, which are placed in a separate module. This permits simple CDC between the bus and user clock domains.
+
+ \item The \textit{Trigger Block}, which generates the core's trigger condition. The trigger block contains a trigger for each input probe, and the registers necessary to configure them. It also contains the $N$-input logic gate (either AND or OR) that generates the core's trigger from the individual probe triggers. CDC is performed in exactly the same manner as the FSM. If an external trigger is specified, the trigger block is omitted from the Logic Analyzer Core, and the external trigger is routed to the FSM's \texttt{trig} input.
+
+ \item The \textit{Sample Memory}, which stores the states of the probes during a capture. This is implemented as a dual-port, dual-clock block memory, with the bus on one port and the probes on the other. The probe-connected port only writes to the memory, with the address and enable pins managed by the FSM. CDC is performed in the block RAM primitive itself.
+\end{itemize}
+
+\begin{figure}[h!]
+\centering
+\includegraphics[width=\textwidth]{manta_logic_analyzer_architecture.png}
+\caption[Block diagram of the Logic Analyzer Core.]{Block diagram of the Logic Analyzer Core. Blocks in blue are clocked on the bus clock, and blocks in orange are clocked on the user clock.}
+\label{manta_logic_analyzer_architecture_fig}
+\end{figure}
diff --git a/doc/tools_used.md b/doc/repository_structure.md
similarity index 77%
rename from doc/tools_used.md
rename to doc/repository_structure.md
index 3ec5719..97767f2 100644
--- a/doc/tools_used.md
+++ b/doc/repository_structure.md
@@ -1,5 +1,4 @@
-## What Goes Where?
-
+## Repository Structure
- `src/manta/` contains the Python and Verilog source needed to generate and run the cores.
- `test/` contains testbenchs for HDL. Manta is written in Verilog 2001, but the testbenches are written in SystemVerilog 2012. These are simulated using Icarus Verilog, which produces `.vcd` files, viewable your favorite waveform viewer, like GTKWave.
- `doc/` contains the documentation you're reading right now! It's built into a nice static site by Material for MkDocs, which automatically rebuilds the site on every commit to `main`. This is done with a GitHub Action configured in `.github/`
@@ -7,7 +6,7 @@
- `.github/` also contains some GitHub Actions configuration for automatically running the SystemVerilog testbenches and building the examples, in addition to automatically rebuilding the site.
## Tools Used
-- Verilator is used for linting
-- Wavedrom for waveform diagrams, and draw.io for block diagrams
-- GNU make for making it more convenient to run simulations.
-- GitHub Pages serves the documentation site.
\ No newline at end of file
+- Icarus Verilog is used for functional simulation.
+- The Project Icestorm tools and Vivado are used for building bitstreams.
+- Wavedrom is used for waveform diagrams, and draw.io for block diagrams.
+- GitHub Pages is used to serve the documentation site.
\ No newline at end of file
diff --git a/doc/system_architecture.md b/doc/system_architecture.md
deleted file mode 100644
index f081598..0000000
--- a/doc/system_architecture.md
+++ /dev/null
@@ -1,177 +0,0 @@
-
-# How it Works
-Manta works by having a set of configurable cores daisy-chained together across a simple bus that resembles AXI-lite. Each core exposes some region of addressible memory, which is accessed by the host machine over an interface of choice. Here's what this looks like as a block diagram, in this case UART is used as the interface:
-
-## Bus
-
-This daisy-chaining is done to make place-and-route as easy as possible - the critical timing path only exists between adjacent cores, instead of rouing back to some central core in a hub-and-spoke arrangement. This relaxed routing helps designs that span multiple clock domains and require BRAMs placed on the edges of clock domains for CDC.
-
-## Memory
-
-The memory is built of 16-bit registers living on a 16-bit address bus. Address space is assigned when the Verilog is generated, since each core can occupy a varying amount of address space depending on how it's configured. This space is assigned sequentially - the first core in the chain will occupy the first section of memory, and the last core will occupy the last section. Some registers are read-only to the host machine, and attempts to write to them will be ignored by the core.
-
-## Read/Write Transactions
-
-As you'd expect, reading from some address will elicit a response from the FGPA. However, writing to some address __will not__. If you want to verify that the data you wrote to some location is valid, read from it after the write. This is done to keep state machines simple and interfaces fast.
-
-Data moves between the host computer and the FPGA over UART. UART's just an interface though, so the choice of what data to send is arbitrary. Manta encodes data exchanged between devices as messages, which are ASCII text in the following format:
-
-```[preamble] [address] [data (optional)] [EOL]```
-
-- The __preamble__ is just the character `M`, encoded as ASCII.
-
-- The __address__ is the memory location we wish to access. This must exist somewhere in the address space consumed by the cores. If it does not, then read/write operations addressed here will do nothing. The address itself is transmitted as hex values, encoded as ASCII using the characters `0-9` and `A-F`.
-
-- The __data__ gets stored in the memory location provided by __address__. The presence of any number of data bytes indicates a write operation, while no data bytes indicates a read operation.
-
-- An __EOL__ indicates the end of the message. CR, LF, or both are considered valid delimiters to for messages sent to the FPGA. For messages sent to the host machine, the FPGA will send CRLF.
-
-This message format can be either a sequence of bytes encoded over UART, or characters in a data field of an Ethernet packet.
-
-### Example Messages
-
-Some examples of valid messages to the FPGA are:
-```MBEEF\r\n```, which writes `0xEF` to the memory at location `0xBE`.
-```MBE\r\n```, which reads the value of the memory at location `0xBE`.
-
-Some examples of invalid messages to the FPGA are:
-```MBEEEF\r\n```f, which contains 12 bits of data, which isn't a multiple of 8.
-```NBEEF\r\n```, which contains the wrong preamble.
-
-For example, `M1234\r\n` specifies a read operation at address `0x1234` in the memory, and if that location contains the data `0x5678`, it will produce a response of `M5678\r\n`.
-
-## Python API
-
-The Python API has two main purposes: to generate the Verilog required to instantiate debug cores on the FPGA, and to let the user easily interact with said cores. The exact Verilog and memory operations are dependent on the cores being configured and the interface between the host machine and the FPGA. This information is stored in a YAML (or JSON) configuration file, which is used to configure an instance of the `Manta` class. This maintains instances of `IOCore`, `LogicAnalyzerCore`, `LUTMemoryCore`, and `BRAMCore` according to the given configuration.
-
-### Loading configuration
-
-Let's use the following configuration as an example:
-
-```yaml
-
----
-cores:
- my_io_core:
- type: io
-
- inputs:
- btnc: 1
- sw: 16
-
- outputs:
- led: 16
- led16_b: 1
- led16_g: 1
- led16_r: 1
-
- my_logic_analyzer:
- type: logic_analyzer
- sample_depth: 4096
-
- probes:
- larry: 1
- curly: 1
- moe: 1
- shemp: 4
-
- triggers:
- - larry && curly && ~moe
-
- my_lut_mem:
- type: lut_mem
- size: 64
-
-uart:
- port: "/dev/tty.usbserial-2102926963071"
- baudrate: 115200
- clock_freq: 100000000
-```
-
-For each core in the config file, an instance of the corresponding Python object is added to the `Manta` object. For instance, the `Manta` instance created by the configuration above will include an `IOCore`, a `LogicAnalyzerCore`, and a `LUTMemoryCore`. Each Core object is instantiated by providing the appropriate section of the config file - for instance, the logic analyzer in the config above will be created by calling `LogicAnalyzerCore(foo)`, where `foo` is:
-
-```yaml
-my_logic_analyzer:
- type: logic_analyzer
- sample_depth: 4096
-
- probes:
- larry: 1
- curly: 1
- moe: 1
- shemp: 4
-
- triggers:
- - larry && curly && ~moe
-```
-Stored as pythonic key-value representation. Each core also checks to make sure it's been given a sensible configuration when it is instantiated - this means the class constructors are mostly assertions about the configuration.
-
-### Generating HDL
-
-Once all the cores have been instantiated and stored in the `Manta` instance, Verilog can be generated. Just like how verifying each core's configuration is left up to core's corresponding Python object, generating the HDL is also left up to each core's corresponding Python object. All that's required is for each core to implement three methods:
-
-- `hdl_inst`, which returns the module instantiation in Verilog as a python string. Any ports that need to connect to modules upstream or downstream on the bus aren't configured by the core. Those connections are made in `Manta.generate_hdl()`, which calls `Manta.generate_insts()`.
-- `hdl_def`, which returns the module definition in Verilog as a Python string. This is usually either generated on-the-fly, or loaded from the Verilog source files included in the Python wheel via `pkgutil`.
-- `hdl_top_level_ports`, which returns a list of any ports that the core needs tied to the top-level declaration of the `manta` module. Usually these are probes going to Logic Analyzers or IO Cores, or the TX/RX lines needed by a UART interface.
-
-Once these have been obtained for each core, the `Manta.generate_hdl()` method will patch them all together to produce `manta.v`, which is a single file that contains all the Verilog needed to instantiate Manta. This file has the following anatomy:
-
-- Asking each core to generate HDL instantiations, definitions, and top_level ports.
-- These then get assembled into the following parts of the file:
- - __Header__ - contains a little blurb about when and who generated the file
- - __Top-Level Module__ - the actual definition of module manta
- - __Declaration__ - contains `module manta` and top-level ports
- that constitutent cores need access to
- - __Interface RX__ - the modules needed to bring whatever interface the user
- selected onto the bus. For UART, this is just an instance
- of uart_rx and bridge_rx.
- - __Core Chain__ - the chain of cores specified by the user. This follows
- a sequence of:
- - Core Instance - HDL specifying an instance of the core.
- - Core Connection - HDL specifying the registers that connect one
- core to the next.
- - Core Instance
- - Core Connection
- ....
-
- This repeats for however many cores the user specified.
-
- - __Interface TX__ - the modules needed to bring the bus out to whatever
- interface the user selected. For UART, this is just
- an instance of bridge_tx and uart_tx.
- - __Footer__ - just the 'endmodule' keyword.
-
- - __Module Definitions__ - all the source for the modules instantiated in the
- top-level module.
-
-
-### Using Cores
-
-Once manta's been generated, included in your project, and built, the Python API provides methods for interfacing with the cores.
-
-## Cores
-
-### Logic Analyzer Core
-
-### Block Diagram
-
-