Initial commit

2020-07-20 21:32:24 +02:00
parent 6bc44dbd53
commit 7b7d41199a
5 changed files with 286 additions and 0 deletions
--- a/README.md
+++ b/README.md
@@ -1,2 +1,66 @@
 # koehn_alignments
 Visualisation of the results of the alignment algorithm in Philipp Koehn's SMT book.
 ## Usage 
 Put the alignments, tokenised e sentences, and tokenised f sentences into the files align, e, and f, respectively (or choose your own names and enter these into the command line.)
 There are examples of the format required included.
 By default, each sentence is read, the alignment is printed to console on a grid, and each phrase pair extracted from the alignment is then rendered in turn.
 ```
 usage: alignments.py [-h] [-r RENDER_TYPE] [-s] [-e E_FILE] [-f F_FILE] [-a ALIGN_FILE]
 optional arguments:
  -h, --help            show this help message and exit
  -r RENDER_TYPE, --render_type RENDER_TYPE
                        How to render the phrases. Choose from 'text', 'image', or 'both'. Default: 'both'
  -s, --hide_alignment  Turn off the initial render of the alignment.
  -e E_FILE, --e_file E_FILE
                        Location of the file containing the translated sentences, separated by newlines. Default: './e'
  -f F_FILE, --f_file F_FILE
                        Location of the file containing the foreign sentences, separated by newlines. Default: './f'
  -a ALIGN_FILE, --align_file ALIGN_FILE
                         Location of the file containing the alignments, separated by newlines. Default: './align'
 ```
 Sample output:
 ```
           m                                                                  
           i                                                              b   
           c             d                                                l   
           h      g      a                    d                    h      e   
           a      e      v      a             a                    a      i   
           e      h      o      u             s      e      i      u      b   
           l      t      n      s      ,      s      r      m      s      t   
        ░░░░░░░       ░░░░░░░       ░░░░░░░       ░░░░░░░       ░░░░░░░       
 michael ░░   ░░       ░░░░░░░       ░░░░░░░       ░░░░░░░       ░░░░░░░       
        ░░░░░░░       ░░░░░░░       ░░░░░░░       ░░░░░░░       ░░░░░░░       
               ███████████████████████████████████       ░░░░░░░       ░░░░░░░
 assumes        ██   ████   ████   ████████████████       ░░░░░░░       ░░░░░░░
               ███████████████████████████████████       ░░░░░░░       ░░░░░░░
        ░░░░░░░███████████████████████████████████░░░░░░░       ░░░░░░░       
   that ░░░░░░░██████████████████████████████   ██░░░░░░░       ░░░░░░░       
        ░░░░░░░███████████████████████████████████░░░░░░░       ░░░░░░░       
               ░░░░░░░       ░░░░░░░       ░░░░░░░       ░░░░░░░       ░░░░░░░
     he        ░░░░░░░       ░░░░░░░       ░░░░░░░  ░░░  ░░░░░░░       ░░░░░░░
               ░░░░░░░       ░░░░░░░       ░░░░░░░       ░░░░░░░       ░░░░░░░
        ░░░░░░░       ░░░░░░░       ░░░░░░░       ░░░░░░░       ░░░░░░░       
   will ░░░░░░░       ░░░░░░░       ░░░░░░░       ░░░░░░░       ░░░░░░░  ░░░  
        ░░░░░░░       ░░░░░░░       ░░░░░░░       ░░░░░░░       ░░░░░░░       
               ░░░░░░░       ░░░░░░░       ░░░░░░░       ░░░░░░░       ░░░░░░░
   stay        ░░░░░░░       ░░░░░░░       ░░░░░░░       ░░░░░░░       ░░   ░░
               ░░░░░░░       ░░░░░░░       ░░░░░░░       ░░░░░░░       ░░░░░░░
        ░░░░░░░       ░░░░░░░       ░░░░░░░       ░░░░░░░       ░░░░░░░       
     in ░░░░░░░       ░░░░░░░       ░░░░░░░       ░░░░░░░  ░░░  ░░░░░░░       
        ░░░░░░░       ░░░░░░░       ░░░░░░░       ░░░░░░░       ░░░░░░░       
               ░░░░░░░       ░░░░░░░       ░░░░░░░       ░░░░░░░       ░░░░░░░
    the        ░░░░░░░       ░░░░░░░       ░░░░░░░       ░░   ░░       ░░░░░░░
               ░░░░░░░       ░░░░░░░       ░░░░░░░       ░░░░░░░       ░░░░░░░
        ░░░░░░░       ░░░░░░░       ░░░░░░░       ░░░░░░░       ░░░░░░░       
  house ░░░░░░░       ░░░░░░░       ░░░░░░░       ░░░░░░░       ░░   ░░       
        ░░░░░░░       ░░░░░░░       ░░░░░░░       ░░░░░░░       ░░░░░░░       
 E:  assumes that
 F:  geht davon aus , dass
 ```
--- a/6
+++ b/6
@@ -0,0 +1,6 @@
 0 0 1 1 1 2 1 3 2 5 3 6 4 9 5 9 6 7 7 7 8 8
 0 0 1 1 2 2 3 4 4 5 5 6 6 7 7 8 8 11 9 12 9 13 10 10 11 9 12 14 13 15 14 15 15 3 16 16
 0 0 1 2 2 3 3 4 4 1 5 2 6 5 7 6 8 7 9 8 10 9 11 10 12 11 12 12 13 13
 0 0 1 1 2 2 3 0 4 0 5 0 6 9 6 10 7 11 8 4 9 5 10 6 11 7 12 8 13 3 14 12 15 14 16 13 17 15 18 16 19 17 19 18 20 19
 0 0 1 1 2 2 3 3 4 4 5 5 6 6 6 7 7 8 8 9 8 10 8 11 9 12 10 13 11 14 12 9 12 10 12 11 13 15
 0 0 1 1 2 2 3 3 4 5 5 4 6 4 7 6 7 7 8 8 9 6 10 9
--- a/alignments.py
+++ b/alignments.py
@@ -0,0 +1,204 @@
 from typing import List, Tuple, NamedTuple, Iterator
 from collections import OrderedDict
 import argparse
 # A word alignment: a list of (e position, f position) index pairs.
 Alignment = List[Tuple[int, int]]
 # A tokenised sentence: a list of word strings.
 Sentence = List[str]
 class PhrasePair(NamedTuple):
    """A span of e-side word positions paired with a span of f-side word positions."""
    # (first, last) word index of the e phrase; both ends are inclusive.
    e_phrase_span: Tuple[int, int]
    # (first, last) word index of the f phrase; both ends are inclusive.
    f_phrase_span: Tuple[int, int]
 # Character strips used to draw one printed line of a grid cell.
 # Outer key: "dotted" leaves a gap in the middle of the strip (used for cells
 # listed in dotted_cells), "filled" draws the strip solid.
 # Inner key: "hili" for highlighted cells, "grey"/"blnk" for the two shades of
 # the checkerboard background.
 BOX_DRAWING = {
    "dotted": {
        "hili": "██   ██",
        "grey": "░░   ░░",
        "blnk": "  ░░░  ",
    },
    "filled": {
        "hili": "███████",
        "grey": "░░░░░░░",
        "blnk": "       ",
    }
 }
 # Width in characters of one grid cell; every strip above has this length.
 CELL_WIDTH = len(BOX_DRAWING["dotted"]["hili"])
 def main():
    """Parse the command line and step through every sentence pair interactively.

    For each (e sentence, f sentence, alignment) triple read from the input
    files, the alignment grid is printed (unless --hide_alignment is given)
    and then every phrase pair found in the alignment is rendered in turn.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-r",
        "--render_type",
        type=str,
        default="both",
        # Reject unknown values up front; an unrecognised value would otherwise
        # render nothing at all while still prompting for every phrase pair.
        choices=("text", "image", "both"),
        # Keep the usage line reading "-r RENDER_TYPE" instead of the choice set.
        metavar="RENDER_TYPE",
        help="How to render the phrases. Choose from 'text', 'image', or 'both'. Default: 'both'"
    )
    parser.add_argument(
        "-s",
        "--hide_alignment",
        action="store_true",
        help="Turn off the initial render of the alignment."
    )
    parser.add_argument(
        "-e",
        "--e_file",
        type=str,
        default="./e",
        help="Location of the file containing the translated sentences, separated by newlines. Default: './e'"
    )
    parser.add_argument(
        "-f",
        "--f_file",
        type=str,
        default="./f",
        help="Location of the file containing the foreign sentences, separated by newlines. Default: './f'"
    )
    parser.add_argument(
        "-a",
        "--align_file",
        type=str,
        default="./align",
        # This file holds alignments, not sentences; the old text was a copy of the -e help.
        help="Location of the file containing the alignments, separated by newlines. Default: './align'"
    )
    args = parser.parse_args()
    for e_sent, f_sent, alignment in load_alignments(args.e_file, args.f_file, args.align_file):
        if not args.hide_alignment:
            # f words label the columns, e words label the rows.
            print_word_grid(f_sent, e_sent, alignment)
            input("Press any key...")
        phrase_pairs = find_all_phrase_pairs_in_alignment(alignment, len(e_sent), len(f_sent))
        render_phrase_pairs(
            phrase_pairs,
            e_sentence=e_sent,
            f_sentence=f_sent,
            render_type=args.render_type,
            alignment=alignment)
 def print_word_grid(sent_x: List[str], sent_y: List[str], highlighted_cells: List[Tuple[int, int]], dotted_cells=None):
    """Print a grid with one column per word of sent_x and one row band per word of sent_y.

    The words of sent_x are written vertically above the columns. Each word of
    sent_y takes three printed lines, with the word itself right-aligned in the
    left margin of the middle line.

    Args:
        sent_x: Words labelling the columns.
        sent_y: Words labelling the rows.
        highlighted_cells: Cells to draw highlighted; forwarded to
            print_appropriate_cell, which looks them up as (y index, x index).
        dotted_cells: Cells to draw with a gap; only applied on the middle
            line of each row band. Defaults to no dotted cells.
    """
    if dotted_cells is None:
        dotted_cells = []
    # default=0 lets an empty sentence (e.g. a blank input line) print an empty
    # grid instead of raising ValueError from max() on an empty sequence.
    max_e_len = max((len(word) for word in sent_y), default=0)
    max_f_len = max((len(word) for word in sent_x), default=0)
    print_grid_headers(max_e_len, max_f_len, sent_x)
    blank_margin = " " * max_e_len
    for y_ord, y_word in enumerate(sent_y):
        for row in range(3):
            is_middle_line = row == 1
            left_column_text = y_word.rjust(max_e_len, " ") if is_middle_line else blank_margin
            print(left_column_text, end=" ")
            for x_ord in range(len(sent_x)):
                # The dotted marker is drawn on the middle line only.
                print_appropriate_cell(x_ord, y_ord, highlighted_cells, dotted_cells if is_middle_line else [])
            print()
 def print_appropriate_cell(x_ord, y_ord, highlighted_cells, dotted_cells):
    """Print one line of the grid cell at column x_ord, row y_ord, without a newline.

    The cell is looked up in both collections as the pair (y_ord, x_ord).
    Highlighted cells use the "hili" strip; all others alternate between
    "grey" and "blnk" depending on the parity of y_ord - x_ord. Cells found in
    dotted_cells use the "dotted" variant of the strip, others the "filled" one.
    """
    cell = (y_ord, x_ord)
    variant = "dotted" if cell in dotted_cells else "filled"
    if cell in highlighted_cells:
        shade = "hili"
    else:
        # Checkerboard background: an even difference is grey, an odd one blank.
        shade = "blnk" if (y_ord - x_ord) % 2 else "grey"
    print(BOX_DRAWING[variant][shade], end="")
 def print_grid_headers(max_e_len, max_f_len, sent_f, width=CELL_WIDTH):
    """Print the words of sent_f vertically, one character per line, above the grid.

    max_f_len lines are printed. Words are bottom-aligned: the last character
    of every word lands on the final line, and shorter words are padded with
    blank cells above. Each line starts with max_e_len spaces plus one more
    space so the columns line up with the grid rows.
    """
    left_margin = " " * max_e_len
    empty_cell = " " * width
    for chars_from_end in reversed(range(1, max_f_len + 1)):
        print(left_margin, end=" ")
        for word in sent_f:
            if chars_from_end <= len(word):
                header_cell = word[-chars_from_end].center(width, " ")
            else:
                # This word is too short to reach this line.
                header_cell = empty_cell
            print(header_cell, end="")
        print()
 def find_all_phrase_pairs_in_alignment(alignment: Alignment, e_len: int, f_len: int) -> OrderedDict:
    """Gather the phrase pairs for every contiguous e span of the sentence.

    For each span e_start..e_end (inclusive) the smallest f span covering all
    f positions aligned to it is computed and handed to extract(); the results
    are merged in the order the spans are visited.

    Args:
        alignment: (e position, f position) pairs.
        e_len: Number of words in the e sentence.
        f_len: Number of words in the f sentence.

    Returns:
        An OrderedDict holding everything extract() returned, merged.
    """
    collected: OrderedDict = OrderedDict()
    for e_start in range(e_len):
        for e_end in range(e_start, e_len):
            covered_f = [f_pos for e_pos, f_pos in alignment if e_start <= e_pos <= e_end]
            # The sentinels f_len and -1 remain when nothing aligns to this e span.
            f_start = min(covered_f + [f_len])
            f_end = max(covered_f + [-1])
            collected.update(extract(alignment, e_start, e_end, f_start, f_end, f_len))
    return collected
 def extract(alignment: Alignment, e_start: int, e_end: int, f_start: int, f_end: int, f_len: int) -> OrderedDict:
    """Return the phrase pairs for one e span and its covering f span.

    Nothing is returned when f_end is -1, or when any alignment point has its
    f position inside f_start..f_end but its e position outside
    e_start..e_end. Otherwise the f span is also widened in both directions
    over neighbouring f positions that appear in no alignment point, and every
    combination of start and end is emitted.

    Returns:
        An OrderedDict used as an ordered set: keys are PhrasePair objects,
        values are always None.
    """
    pairs = OrderedDict()
    if f_end == -1:
        return pairs
    conflict = any(
        f_start <= f_pos <= f_end and not (e_start <= e_pos <= e_end)
        for e_pos, f_pos in alignment
    )
    if conflict:
        return pairs

    # Candidate starts: f_start, then each lower position that is in range
    # and appears in no alignment point.
    f_starts = [f_start]
    candidate = f_start - 1
    while candidate >= 0 and not f_in_alignment(candidate, alignment):
        f_starts.append(candidate)
        candidate -= 1

    # Candidate ends: f_end, then each higher position under the same rule.
    f_ends = [f_end]
    candidate = f_end + 1
    while candidate < f_len and not f_in_alignment(candidate, alignment):
        f_ends.append(candidate)
        candidate += 1

    e_span = (e_start, e_end)
    for stretch_start in f_starts:
        for stretch_end in f_ends:
            pairs[PhrasePair(e_phrase_span=e_span, f_phrase_span=(stretch_start, stretch_end))] = None
    return pairs
 def f_in_alignment(sought_f_pos: int, alignment: Alignment):
    """Return True if any alignment point has sought_f_pos as its f position, else False."""
    return any(f_pos == sought_f_pos for _, f_pos in alignment)
 def render_phrase_pairs(phrase_pairs: OrderedDict, e_sentence: List[str], f_sentence: List[str], render_type: str, alignment: Alignment) -> None:
    """Show each phrase pair in turn, waiting for input after every one.

    Args:
        phrase_pairs: Phrase pairs to show; iterated in order.
        e_sentence: Words of the e sentence (grid rows).
        f_sentence: Words of the f sentence (grid columns).
        render_type: "image" draws the grid with the phrase pair highlighted
            and the alignment points dotted, "text" prints the words of both
            phrases, "both" does both.
        alignment: Alignment points, passed to the grid as its dotted cells.
    """
    draw_grid = render_type in ("image", "both")
    write_text = render_type in ("text", "both")
    for pair in phrase_pairs:
        if draw_grid:
            print_word_grid(f_sentence, e_sentence, list_cells_in_phrase_pair(pair), alignment)
        if write_text:
            e_span, f_span = pair
            # Spans are inclusive at both ends, hence the + 1.
            e_words = [e_sentence[idx] for idx in range(e_span[0], e_span[1] + 1)]
            print("E: ", *e_words)
            f_words = [f_sentence[idx] for idx in range(f_span[0], f_span[1] + 1)]
            print("F: ", *f_words)
            print()
        input("Press any key...")
 def list_cells_in_phrase_pair(phrase_pair: PhrasePair) -> List[Tuple[int, int]]:
    """List every (e position, f position) cell inside the phrase pair's rectangle.

    Both spans are inclusive. Cells are ordered by e position first, then by
    f position.
    """
    e_first, e_last = phrase_pair.e_phrase_span[0], phrase_pair.e_phrase_span[1]
    f_first, f_last = phrase_pair.f_phrase_span[0], phrase_pair.f_phrase_span[1]
    return [
        (e_pos, f_pos)
        for e_pos in range(e_first, e_last + 1)
        for f_pos in range(f_first, f_last + 1)
    ]
 def load_alignments(e_file: str, f_file: str, align_file: str) -> Iterator[Tuple[Sentence, Sentence, Alignment]]:
    """Yield (e sentence, f sentence, alignment) triples read in lockstep from the three files.

    Iteration stops as soon as the shortest of the three files is exhausted.
    """
    e_sentences = sents_in_file(e_file)
    f_sentences = sents_in_file(f_file)
    alignments = alignments_in_file(align_file)
    yield from zip(e_sentences, f_sentences, alignments)
 def alignments_in_file(source_file_loc: str) -> Iterator[Alignment]:
    """Yield one alignment for each line of the file at source_file_loc.

    A line is a whitespace-separated run of integers, read two at a time as
    (first, second) pairs. A blank line yields an empty list.
    """
    with open(source_file_loc, 'r') as source_file:
        for line in source_file:
            tokens = line.split()
            yield [
                (int(tokens[idx]), int(tokens[idx + 1]))
                for idx in range(0, len(tokens), 2)
            ]
 def sents_in_file(source_file_loc: str) -> Iterator[List[str]]:
    """Yield each line of the file at source_file_loc as a list of whitespace-separated tokens.

    A blank line yields an empty list.
    """
    # Decode explicitly as UTF-8 so non-ASCII words (e.g. German umlauts) are
    # read the same way regardless of the platform's locale encoding.
    with open(source_file_loc, 'r', encoding="utf-8") as source_file:
        for line in source_file:
            yield line.split()
 # Run the interactive viewer only when this file is executed as a script.
 if __name__ == "__main__":
    main()
--- a/6
+++ b/6
@@ -0,0 +1,6 @@
 michael assumes that he will stay in the house
 NULL Sie dürfen der schönen Morticia , die zum Sterben schön ist , beim Tanzen zuschauen .
 NULL Mach das Fenster bitte zu , es gibt einen heftigen Durchzug gerade .
 NULL Wenn man an der Stelle im Satz das Wort " Wald " liest , stört es den schönen Lesefluss .
 NULL Wie sind Sie , Herr Schachmeister , auf diesen unorthodoxen Zug gekommen ?
 NULL Mit deiner Technik kommt gar nichts vom Teller runter .
--- a/6
+++ b/6
@@ -0,0 +1,6 @@
 michael geht davon aus , dass er im haus bleibt
 NULL You can watch the beautiful Morticia , who is beautiful to die for , dancing .
 NULL Please close the window , there is a violent draft right now .
 NULL If one reads the word " forest " in the sentence , it disturbs the beautiful reading flow .
 NULL How did you , Mr Chess Master , come to make this unorthodox move ?
 NULL With your technique nothing comes off the plate .