From 7b7d41199a4e3bced7292accf2c4a49baaf673d9 Mon Sep 17 00:00:00 2001
From: Daniel Ledda <ledda@cip.ifi.lmu.de>
Date: Mon, 20 Jul 2020 21:32:24 +0200
Subject: [PATCH] Initial commit

---
 README.md     |  64 ++++++++++++++++
 align         |   6 ++
 alignments.py | 204 ++++++++++++++++++++++++++++++++++++++++++++++++++
 e             |   6 ++
 f             |   6 ++
 5 files changed, 286 insertions(+)
 create mode 100644 align
 create mode 100644 alignments.py
 create mode 100644 e
 create mode 100644 f

diff --git a/README.md b/README.md
index 8b609ee..64b81a9 100644
--- a/README.md
+++ b/README.md
@@ -1,2 +1,66 @@
 # koehn_alignments
 Visualisation of the results of the alignment algorithm in Philipp Koehn's SMT book.
+
+## Usage 
+
+Put the alignments, tokenised e sentences, and tokenised f sentences into the files align, e, and f, respectively (or choose your own file names and pass them on the command line).
+Example files in the required format are included.
+
+By default, each sentence is read, the alignment is printed to the console on a grid, and each extracted phrase pair is then shown in turn, both as a highlighted grid and as text.
+
+```
+usage: alignments.py [-h] [-r RENDER_TYPE] [-s] [-e E_FILE] [-f F_FILE] [-a ALIGN_FILE]
+
+optional arguments:
+  -h, --help            show this help message and exit
+  -r RENDER_TYPE, --render_type RENDER_TYPE
+                        How to render the phrases. Choose from 'text', 'image', or 'both'. Default: 'both'
+  -s, --hide_alignment  Turn off the initial render of the alignment.
+  -e E_FILE, --e_file E_FILE
+                        Location of the file containing the translated sentences, separated by newlines. Default: './e'
+  -f F_FILE, --f_file F_FILE
+                        Location of the file containing the foreign sentences, separated by newlines. Default: './f'
+  -a ALIGN_FILE, --align_file ALIGN_FILE
+                        Location of the file containing the word alignments, separated by newlines. Default: './align'
+```
+
+Sample output:
+
+```
+           m                                                                  
+           i                                                              b   
+           c             d                                                l   
+           h      g      a                    d                    h      e   
+           a      e      v      a             a                    a      i   
+           e      h      o      u             s      e      i      u      b   
+           l      t      n      s      ,      s      r      m      s      t   
+        ░░░░░░░       ░░░░░░░       ░░░░░░░       ░░░░░░░       ░░░░░░░       
+michael ░░   ░░       ░░░░░░░       ░░░░░░░       ░░░░░░░       ░░░░░░░       
+        ░░░░░░░       ░░░░░░░       ░░░░░░░       ░░░░░░░       ░░░░░░░       
+               ███████████████████████████████████       ░░░░░░░       ░░░░░░░
+assumes        ██   ████   ████   ████████████████       ░░░░░░░       ░░░░░░░
+               ███████████████████████████████████       ░░░░░░░       ░░░░░░░
+        ░░░░░░░███████████████████████████████████░░░░░░░       ░░░░░░░       
+   that ░░░░░░░██████████████████████████████   ██░░░░░░░       ░░░░░░░       
+        ░░░░░░░███████████████████████████████████░░░░░░░       ░░░░░░░       
+               ░░░░░░░       ░░░░░░░       ░░░░░░░       ░░░░░░░       ░░░░░░░
+     he        ░░░░░░░       ░░░░░░░       ░░░░░░░  ░░░  ░░░░░░░       ░░░░░░░
+               ░░░░░░░       ░░░░░░░       ░░░░░░░       ░░░░░░░       ░░░░░░░
+        ░░░░░░░       ░░░░░░░       ░░░░░░░       ░░░░░░░       ░░░░░░░       
+   will ░░░░░░░       ░░░░░░░       ░░░░░░░       ░░░░░░░       ░░░░░░░  ░░░  
+        ░░░░░░░       ░░░░░░░       ░░░░░░░       ░░░░░░░       ░░░░░░░       
+               ░░░░░░░       ░░░░░░░       ░░░░░░░       ░░░░░░░       ░░░░░░░
+   stay        ░░░░░░░       ░░░░░░░       ░░░░░░░       ░░░░░░░       ░░   ░░
+               ░░░░░░░       ░░░░░░░       ░░░░░░░       ░░░░░░░       ░░░░░░░
+        ░░░░░░░       ░░░░░░░       ░░░░░░░       ░░░░░░░       ░░░░░░░       
+     in ░░░░░░░       ░░░░░░░       ░░░░░░░       ░░░░░░░  ░░░  ░░░░░░░       
+        ░░░░░░░       ░░░░░░░       ░░░░░░░       ░░░░░░░       ░░░░░░░       
+               ░░░░░░░       ░░░░░░░       ░░░░░░░       ░░░░░░░       ░░░░░░░
+    the        ░░░░░░░       ░░░░░░░       ░░░░░░░       ░░   ░░       ░░░░░░░
+               ░░░░░░░       ░░░░░░░       ░░░░░░░       ░░░░░░░       ░░░░░░░
+        ░░░░░░░       ░░░░░░░       ░░░░░░░       ░░░░░░░       ░░░░░░░       
+  house ░░░░░░░       ░░░░░░░       ░░░░░░░       ░░░░░░░       ░░   ░░       
+        ░░░░░░░       ░░░░░░░       ░░░░░░░       ░░░░░░░       ░░░░░░░       
+E:  assumes that
+F:  geht davon aus , dass
+```
\ No newline at end of file
diff --git a/align b/align
new file mode 100644
index 0000000..9760d4b
--- /dev/null
+++ b/align
@@ -0,0 +1,6 @@
+0 0 1 1 1 2 1 3 2 5 3 6 4 9 5 9 6 7 7 7 8 8
+0 0 1 1 2 2 3 4 4 5 5 6 6 7 7 8 8 11 9 12 9 13 10 10 11 9 12 14 13 15 14 15 15 3 16 16
+0 0 1 2 2 3 3 4 4 1 5 2 6 5 7 6 8 7 9 8 10 9 11 10 12 11 12 12 13 13
+0 0 1 1 2 2 3 0 4 0 5 0 6 9 6 10 7 11 8 4 9 5 10 6 11 7 12 8 13 3 14 12 15 14 16 13 17 15 18 16 19 17 19 18 20 19
+0 0 1 1 2 2 3 3 4 4 5 5 6 6 6 7 7 8 8 9 8 10 8 11 9 12 10 13 11 14 12 9 12 10 12 11 13 15
+0 0 1 1 2 2 3 3 4 5 5 4 6 4 7 6 7 7 8 8 9 6 10 9
diff --git a/alignments.py b/alignments.py
new file mode 100644
index 0000000..eb8f473
--- /dev/null
+++ b/alignments.py
@@ -0,0 +1,204 @@
+from typing import List, Tuple, NamedTuple, Iterator
+from collections import OrderedDict
+import argparse
+
+Alignment = List[Tuple[int, int]]  # (e_position, f_position) index pairs for one sentence pair
+Sentence = List[str]  # one sentence as a list of tokens
+
+
+class PhrasePair(NamedTuple):  # one extracted phrase pair; both spans are inclusive (first, last) word indices
+    e_phrase_span: Tuple[int, int]  # span within the e sentence
+    f_phrase_span: Tuple[int, int]  # span within the f sentence
+
+
+BOX_DRAWING = {  # glyph for one text row of a grid cell, keyed by style and then by colour
+    "dotted": {  # style used for cells listed in dotted_cells: the fill is broken in the middle
+        "hili": "██   ██",
+        "grey": "░░   ░░",
+        "blnk": "  ░░░  ",
+    },
+    "filled": {  # style used for every other cell: uniform fill
+        "hili": "███████",
+        "grey": "░░░░░░░",
+        "blnk": "       ",
+    }
+}
+CELL_WIDTH = len(BOX_DRAWING["dotted"]["hili"])  # width of one grid cell in characters, taken from a glyph
+
+
+def main():
+    """Parse the CLI options, then step through each sentence pair and its phrase pairs."""
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "-r",
+        "--render_type",
+        type=str,
+        default="both",
+        choices=("text", "image", "both"),  # fail fast: any other value used to render nothing
+        metavar="RENDER_TYPE",  # keep the usage text identical despite `choices`
+        help="How to render the phrases. Choose from 'text', 'image', or 'both'. Default: 'both'"
+    )
+    parser.add_argument(
+        "-s",
+        "--hide_alignment",
+        action="store_true",
+        help="Turn off the initial render of the alignment."
+    )
+    parser.add_argument(
+        "-e",
+        "--e_file",
+        type=str,
+        default="./e",
+        help="Location of the file containing the translated sentences, separated by newlines. Default: './e'"
+    )
+    parser.add_argument(
+        "-f",
+        "--f_file",
+        type=str,
+        default="./f",
+        help="Location of the file containing the foreign sentences, separated by newlines. Default: './f'"
+    )
+    parser.add_argument(
+        "-a",
+        "--align_file",
+        type=str,
+        default="./align",
+        help="Location of the file containing the word alignments, separated by newlines. Default: './align'"
+    )
+    args = parser.parse_args()
+
+    for e_sent, f_sent, alignment in load_alignments(args.e_file, args.f_file, args.align_file):
+        if not args.hide_alignment:
+            print_word_grid(f_sent, e_sent, alignment)
+            input("Press any key...")
+        phrase_pairs = find_all_phrase_pairs_in_alignment(alignment, len(e_sent), len(f_sent))
+        render_phrase_pairs(
+            phrase_pairs, e_sentence=e_sent, f_sentence=f_sent,
+            render_type=args.render_type, alignment=alignment)
+
+
+def print_word_grid(sent_x: List[str], sent_y: List[str], highlighted_cells: List[Tuple[int, int]], dotted_cells=None):
+    """Print sent_x as column headers and sent_y as row labels around a grid of (y, x) cells."""
+    label_width = max((len(word) for word in sent_y), default=0)  # default=0: an empty sentence must not crash
+    header_height = max((len(word) for word in sent_x), default=0)
+    highlighted = set(highlighted_cells)  # sets make the per-cell membership tests O(1)
+    dotted = set(dotted_cells or ())
+    print_grid_headers(label_width, header_height, sent_x)
+    for y_ord, label in enumerate(sent_y):
+        for row in range(3):  # a cell is three text rows tall; label and dots go on the middle row only
+            print(label.rjust(label_width, " ") if row == 1 else " " * label_width, end=" ")
+            for x_ord in range(len(sent_x)):
+                print_appropriate_cell(x_ord, y_ord, highlighted, dotted if row == 1 else ())
+            print()
+
+
+def print_appropriate_cell(x_ord, y_ord, highlighted_cells, dotted_cells):
+    """Print (without a newline) the glyph for one text row of grid cell (y_ord, x_ord).
+
+    Highlighted cells win; all other cells alternate grey/blank like a checkerboard.
+    """
+    cell = (y_ord, x_ord)
+    checkerboard = "blnk" if (y_ord - x_ord) % 2 else "grey"
+    shade = "hili" if cell in highlighted_cells else checkerboard
+    style = BOX_DRAWING["dotted"] if cell in dotted_cells else BOX_DRAWING["filled"]
+    print(style[shade], end="")
+
+
+def print_grid_headers(max_e_len, max_f_len, sent_f, width=CELL_WIDTH):
+    """Print the words of sent_f vertically and bottom-aligned, one centred character per column."""
+    margin = " " * max_e_len + " "  # blank gutter above the row labels
+    for offset_from_end in range(max_f_len, 0, -1):
+        cells = [
+            word[-offset_from_end].center(width, " ") if offset_from_end <= len(word) else " " * width
+            for word in sent_f
+        ]
+        print(margin + "".join(cells))
+
+
+def find_all_phrase_pairs_in_alignment(alignment: Alignment, e_len: int, f_len: int) -> OrderedDict:
+    """Run extract() on every e span (start <= end) and merge the results; the keys are the phrase pairs."""
+    found: OrderedDict = OrderedDict()
+    e_spans = ((start, end) for start in range(e_len) for end in range(start, e_len))
+    for e_start, e_end in e_spans:
+        # f positions linked to an e word inside the span
+        linked_f = [f_pos for e_pos, f_pos in alignment if e_start <= e_pos <= e_end]
+        # With no links these stay at the sentinels (f_len, -1), which extract() treats as "unaligned".
+        f_start = min(linked_f + [f_len])
+        f_end = max(linked_f + [-1])
+        found.update(extract(alignment, e_start, e_end, f_start, f_end, f_len))
+    return found
+
+
+def extract(alignment: Alignment, e_start: int, e_end: int, f_start: int, f_end: int, f_len: int) -> OrderedDict:
+    """Return the phrase pairs for e span [e_start, e_end] as keys of an OrderedDict (values are None).
+
+    Empty when the span is unaligned (f_end == -1) or when some f position in
+    [f_start, f_end] is aligned to an e word outside the span. Otherwise the f span
+    is also widened over neighbouring f positions that appear in no alignment point.
+    """
+    pairs = OrderedDict()
+    if f_end == -1:
+        return pairs
+    if any(f_start <= f <= f_end and not e_start <= e <= e_end for e, f in alignment):
+        return pairs
+    aligned_f = {f for _, f in alignment}
+    f_ends, f_starts = [f_end], [f_start]
+    while f_ends[-1] + 1 < f_len and f_ends[-1] + 1 not in aligned_f:
+        f_ends.append(f_ends[-1] + 1)
+    while f_starts[-1] - 1 >= 0 and f_starts[-1] - 1 not in aligned_f:
+        f_starts.append(f_starts[-1] - 1)
+    for start in f_starts:  # start moves outward in the outer loop, end in the inner one
+        pairs.update((PhrasePair((e_start, e_end), (start, end)), None) for end in f_ends)
+    return pairs
+
+
+def f_in_alignment(sought_f_pos: int, alignment: Alignment):
+    """Return True if any alignment point has sought_f_pos as its f position."""
+    # Each point is an (e_pos, f_pos) pair, so only the second element
+    # is compared.
+    return any(sought_f_pos == f_pos for _, f_pos in alignment)
+
+
+def render_phrase_pairs(phrase_pairs: OrderedDict, e_sentence: List[str], f_sentence: List[str], render_type: str, alignment: Alignment) -> None:
+    """Show each phrase pair as a grid ('image'), as words ('text') or both, waiting for input after each."""
+    for pair in phrase_pairs:
+        if render_type in ("image", "both"):
+            print_word_grid(f_sentence, e_sentence, list_cells_in_phrase_pair(pair), alignment)
+        if render_type in ("text", "both"):
+            for tag, words, (first, last) in zip(("E: ", "F: "), (e_sentence, f_sentence), pair):
+                print(tag, *(words[i] for i in range(first, last + 1)))
+            print()
+        input("Press any key...")
+
+
+def list_cells_in_phrase_pair(phrase_pair: PhrasePair) -> List[Tuple[int, int]]:
+    """List every (e_pos, f_pos) cell covered by the pair's inclusive spans, e position outermost."""
+    e_span, f_span = phrase_pair.e_phrase_span, phrase_pair.f_phrase_span
+    e_positions = range(e_span[0], e_span[1] + 1)
+    f_positions = range(f_span[0], f_span[1] + 1)
+    return [(e_pos, f_pos) for e_pos in e_positions for f_pos in f_positions]
+
+
+def load_alignments(e_file: str, f_file: str, align_file: str) -> Iterator[Tuple[Sentence, Sentence, Alignment]]:
+    """Yield (e sentence, f sentence, alignment) triples read in lockstep; ends with the shortest file."""
+    yield from zip(sents_in_file(e_file), sents_in_file(f_file), alignments_in_file(align_file))
+
+
+def alignments_in_file(source_file_loc: str) -> Iterator[Alignment]:
+    """Yield one alignment per line of the file.
+
+    A line is whitespace-separated integers, read as consecutive (first, second) pairs.
+    """
+    with open(source_file_loc, 'r') as source_file:
+        for numbers in ([int(token) for token in line.split()] for line in source_file):
+            yield [(numbers[i], numbers[i + 1]) for i in range(0, len(numbers), 2)]
+
+
+def sents_in_file(source_file_loc: str) -> Iterator[List[str]]:
+    """Yield each line of the file as a list of whitespace-separated tokens (a blank line gives [])."""
+    with open(source_file_loc, 'r', encoding='utf-8') as source_file:  # sample data has umlauts; don't rely on the locale
+        yield from (line.split() for line in source_file)
+
+
+if __name__ == "__main__":  # run the CLI only when executed as a script, not on import
+    main()
diff --git a/e b/e
new file mode 100644
index 0000000..d9e0b4d
--- /dev/null
+++ b/e
@@ -0,0 +1,6 @@
+michael assumes that he will stay in the house
+NULL Sie dürfen der schönen Morticia , die zum Sterben schön ist , beim Tanzen zuschauen .
+NULL Mach das Fenster bitte zu , es gibt einen heftigen Durchzug gerade .
+NULL Wenn man an der Stelle im Satz das Wort " Wald " liest , stört es den schönen Lesefluss .
+NULL Wie sind Sie , Herr Schachmeister , auf diesen unorthodoxen Zug gekommen ?
+NULL Mit deiner Technik kommt gar nichts vom Teller runter .
diff --git a/f b/f
new file mode 100644
index 0000000..846c48f
--- /dev/null
+++ b/f
@@ -0,0 +1,6 @@
+michael geht davon aus , dass er im haus bleibt
+NULL You can watch the beautiful Morticia , who is beautiful to die for , dancing .
+NULL Please close the window , there is a violent draft right now .
+NULL If one reads the word " forest " in the sentence , it disturbs the beautiful reading flow .
+NULL How did you , Mr Chess Master , come to make this unorthodox move ?
+NULL With your technique nothing comes off the plate .