From 7b7d41199a4e3bced7292accf2c4a49baaf673d9 Mon Sep 17 00:00:00 2001 From: Daniel Ledda Date: Mon, 20 Jul 2020 21:32:24 +0200 Subject: [PATCH] Initial commit --- README.md | 64 ++++++++++++++++ align | 6 ++ alignments.py | 204 ++++++++++++++++++++++++++++++++++++++++++++++++++ e | 6 ++ f | 6 ++ 5 files changed, 286 insertions(+) create mode 100644 align create mode 100644 alignments.py create mode 100644 e create mode 100644 f diff --git a/README.md b/README.md index 8b609ee..64b81a9 100644 --- a/README.md +++ b/README.md @@ -1,2 +1,66 @@ # koehn_alignments Visualisation of the results of the alignment algorithm in Philipp Koehn's SMT book. + +## Usage + +Put the alignments, tokenised e sentences, and tokenised f sentences into the files align, e, and f, respectively (or choose your own file names and pass them on the command line). +Examples of the required format are included. + +By default, each sentence is read, the alignment is printed to console on a grid, and each phrase pair extracted from that alignment is then rendered in turn (as a grid, as text, or both), pausing for input between renders. + +``` +usage: alignments.py [-h] [-r RENDER_TYPE] [-s] [-e E_FILE] [-f F_FILE] [-a ALIGN_FILE] + +optional arguments: + -h, --help show this help message and exit + -r RENDER_TYPE, --render_type RENDER_TYPE + How to render the phrases. Choose from 'text', 'image', or 'both'. Default: 'both' + -s, --hide_alignment Turn off the initial render of the alignment. + -e E_FILE, --e_file E_FILE + Location of the file containing the translated sentences, separated by newlines. Default: './e' + -f F_FILE, --f_file F_FILE + Location of the file containing the foreign sentences, separated by newlines. Default: './f' + -a ALIGN_FILE, --align_file ALIGN_FILE + Location of the file containing translated sentences, separated by newlines. 
Default: './align' +``` + +Sample output: + +``` + m + i b + c d l + h g a d h e + a e v a a a i + e h o u s e i u b + l t n s , s r m s t + ░░░░░░░ ░░░░░░░ ░░░░░░░ ░░░░░░░ ░░░░░░░ +michael ░░ ░░ ░░░░░░░ ░░░░░░░ ░░░░░░░ ░░░░░░░ + ░░░░░░░ ░░░░░░░ ░░░░░░░ ░░░░░░░ ░░░░░░░ + ███████████████████████████████████ ░░░░░░░ ░░░░░░░ +assumes ██ ████ ████ ████████████████ ░░░░░░░ ░░░░░░░ + ███████████████████████████████████ ░░░░░░░ ░░░░░░░ + ░░░░░░░███████████████████████████████████░░░░░░░ ░░░░░░░ + that ░░░░░░░██████████████████████████████ ██░░░░░░░ ░░░░░░░ + ░░░░░░░███████████████████████████████████░░░░░░░ ░░░░░░░ + ░░░░░░░ ░░░░░░░ ░░░░░░░ ░░░░░░░ ░░░░░░░ + he ░░░░░░░ ░░░░░░░ ░░░░░░░ ░░░ ░░░░░░░ ░░░░░░░ + ░░░░░░░ ░░░░░░░ ░░░░░░░ ░░░░░░░ ░░░░░░░ + ░░░░░░░ ░░░░░░░ ░░░░░░░ ░░░░░░░ ░░░░░░░ + will ░░░░░░░ ░░░░░░░ ░░░░░░░ ░░░░░░░ ░░░░░░░ ░░░ + ░░░░░░░ ░░░░░░░ ░░░░░░░ ░░░░░░░ ░░░░░░░ + ░░░░░░░ ░░░░░░░ ░░░░░░░ ░░░░░░░ ░░░░░░░ + stay ░░░░░░░ ░░░░░░░ ░░░░░░░ ░░░░░░░ ░░ ░░ + ░░░░░░░ ░░░░░░░ ░░░░░░░ ░░░░░░░ ░░░░░░░ + ░░░░░░░ ░░░░░░░ ░░░░░░░ ░░░░░░░ ░░░░░░░ + in ░░░░░░░ ░░░░░░░ ░░░░░░░ ░░░░░░░ ░░░ ░░░░░░░ + ░░░░░░░ ░░░░░░░ ░░░░░░░ ░░░░░░░ ░░░░░░░ + ░░░░░░░ ░░░░░░░ ░░░░░░░ ░░░░░░░ ░░░░░░░ + the ░░░░░░░ ░░░░░░░ ░░░░░░░ ░░ ░░ ░░░░░░░ + ░░░░░░░ ░░░░░░░ ░░░░░░░ ░░░░░░░ ░░░░░░░ + ░░░░░░░ ░░░░░░░ ░░░░░░░ ░░░░░░░ ░░░░░░░ + house ░░░░░░░ ░░░░░░░ ░░░░░░░ ░░░░░░░ ░░ ░░ + ░░░░░░░ ░░░░░░░ ░░░░░░░ ░░░░░░░ ░░░░░░░ +E: assumes that +F: geht davon aus , dass +``` \ No newline at end of file diff --git a/align b/align new file mode 100644 index 0000000..9760d4b --- /dev/null +++ b/align @@ -0,0 +1,6 @@ +0 0 1 1 1 2 1 3 2 5 3 6 4 9 5 9 6 7 7 7 8 8 +0 0 1 1 2 2 3 4 4 5 5 6 6 7 7 8 8 11 9 12 9 13 10 10 11 9 12 14 13 15 14 15 15 3 16 16 +0 0 1 2 2 3 3 4 4 1 5 2 6 5 7 6 8 7 9 8 10 9 11 10 12 11 12 12 13 13 +0 0 1 1 2 2 3 0 4 0 5 0 6 9 6 10 7 11 8 4 9 5 10 6 11 7 12 8 13 3 14 12 15 14 16 13 17 15 18 16 19 17 19 18 20 19 +0 0 1 1 2 2 3 3 4 4 5 5 6 6 6 7 7 8 8 9 8 10 8 11 9 12 10 13 11 14 12 9 12 10 12 11 13 
"""Visualise phrase pairs extracted from word-aligned sentence pairs.

Reads tokenised "e" and "f" sentences together with their word alignments,
prints each alignment as a character grid, and then renders every phrase pair
that is consistent with that alignment.
"""

from typing import List, Tuple, NamedTuple, Iterator
from collections import OrderedDict
import argparse

# An alignment is a list of (e_position, f_position) points; positions are
# used directly as indices into the tokenised sentences.
Alignment = List[Tuple[int, int]]
Sentence = List[str]


class PhrasePair(NamedTuple):
    """Inclusive (start, end) word spans of a phrase in the e and f sentences."""

    e_phrase_span: Tuple[int, int]
    f_phrase_span: Tuple[int, int]


# Every cell must have the same width or the grid columns drift apart, so the
# strings are built with explicit repetition instead of literal runs of spaces.
BOX_DRAWING = {
    "dotted": {
        "hili": "██" + " " * 3 + "██",
        "grey": "░░" + " " * 3 + "░░",
        "blnk": " " * 2 + "░" * 3 + " " * 2,
    },
    "filled": {
        "hili": "█" * 7,
        "grey": "░" * 7,
        "blnk": " " * 7,
    },
}
CELL_WIDTH = len(BOX_DRAWING["dotted"]["hili"])

# Values accepted by --render_type.
RENDER_TYPES = ("text", "image", "both")


def main():
    """Parse the command line, then render each sentence pair and its phrase pairs."""
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-r",
        "--render_type",
        type=str,
        default="both",
        # Reject typos up front instead of silently rendering nothing.
        choices=RENDER_TYPES,
        help="How to render the phrases. Choose from 'text', 'image', or 'both'. Default: 'both'",
    )
    parser.add_argument(
        "-s",
        "--hide_alignment",
        action="store_true",
        help="Turn off the initial render of the alignment.",
    )
    parser.add_argument(
        "-e",
        "--e_file",
        type=str,
        default="./e",
        help="Location of the file containing the translated sentences, separated by newlines. Default: './e'",
    )
    parser.add_argument(
        "-f",
        "--f_file",
        type=str,
        default="./f",
        help="Location of the file containing the foreign sentences, separated by newlines. Default: './f'",
    )
    parser.add_argument(
        "-a",
        "--align_file",
        type=str,
        default="./align",
        help="Location of the file containing the alignments, one sentence per line as "
             "space-separated pairs of e and f word positions. Default: './align'",
    )
    args = parser.parse_args()

    for e_sent, f_sent, alignment in load_alignments(args.e_file, args.f_file, args.align_file):
        if not args.hide_alignment:
            print_word_grid(f_sent, e_sent, alignment)
            input("Press any key...")
        phrase_pairs = find_all_phrase_pairs_in_alignment(alignment, len(e_sent), len(f_sent))
        render_phrase_pairs(
            phrase_pairs,
            e_sentence=e_sent,
            f_sentence=f_sent,
            render_type=args.render_type,
            alignment=alignment)


def print_word_grid(sent_x: List[str], sent_y: List[str], highlighted_cells: List[Tuple[int, int]], dotted_cells=None):
    """Print a grid with ``sent_x`` words as columns and ``sent_y`` words as rows.

    Args:
        sent_x: Words written vertically above the columns.
        sent_y: Words written to the left of the rows.
        highlighted_cells: ``(y, x)`` cells drawn in the highlight colour.
        dotted_cells: ``(y, x)`` cells drawn hollow on their middle text row;
            ``None`` means no cell is dotted.
    """
    # Sets give constant-time membership tests; the lists were rescanned per cell.
    highlighted = set(highlighted_cells)
    dotted = set(dotted_cells) if dotted_cells is not None else set()
    no_dots = frozenset()
    # default=0 keeps an empty sentence from crashing max().
    max_e_len = max((len(word) for word in sent_y), default=0)
    max_f_len = max((len(word) for word in sent_x), default=0)
    print_grid_headers(max_e_len, max_f_len, sent_x)
    for y_ord, y_word in enumerate(sent_y):
        # Each grid row is three text rows tall; only the middle one carries
        # the row label and the dotted marking.
        for row in range(3):
            is_middle_row = row == 1
            left_column_text = y_word.rjust(max_e_len, " ") if is_middle_row else " " * max_e_len
            print(left_column_text, end=" ")
            for x_ord in range(len(sent_x)):
                print_appropriate_cell(x_ord, y_ord, highlighted, dotted if is_middle_row else no_dots)
            print()


def print_appropriate_cell(x_ord, y_ord, highlighted_cells, dotted_cells):
    """Print one cell (no newline), choosing its colour and fill style.

    Highlighted cells win; all others alternate grey/blank in a checkerboard.
    Cells listed in ``dotted_cells`` use the hollow "dotted" glyphs.
    """
    test_pair = (y_ord, x_ord)
    if test_pair in highlighted_cells:
        color = "hili"
    elif (y_ord - x_ord) % 2 == 0:
        color = "grey"
    else:
        color = "blnk"
    style = "dotted" if test_pair in dotted_cells else "filled"
    print(BOX_DRAWING[style][color], end="")


def print_grid_headers(max_e_len, max_f_len, sent_f, width=CELL_WIDTH):
    """Print the column words vertically, bottom-aligned above the grid.

    Args:
        max_e_len: Width of the row-label column that must be skipped.
        max_f_len: Length of the longest column word (number of header rows).
        sent_f: The column words.
        width: Width of one grid cell; each letter is centred in it.
    """
    for char_pos in range(max_f_len, 0, -1):
        print(" " * max_e_len, end=" ")
        for word in sent_f:
            # Words shorter than the current row leave their column blank.
            letter = word[-char_pos] if char_pos <= len(word) else " "
            print(letter.center(width, " "), end="")
        print()


def find_all_phrase_pairs_in_alignment(alignment: Alignment, e_len: int, f_len: int) -> OrderedDict:
    """Collect every phrase pair consistent with ``alignment``.

    Tries each contiguous e span, finds the smallest f span covering all words
    aligned to it, and delegates to :func:`extract`.

    Returns:
        An ``OrderedDict`` used as an insertion-ordered set: keys are
        ``PhrasePair`` objects, values are always ``None``.
    """
    phrase_pairs: OrderedDict = OrderedDict()
    for e_start in range(e_len):
        for e_end in range(e_start, e_len):
            aligned_f = [f_pos for e_pos, f_pos in alignment if e_start <= e_pos <= e_end]
            # Seeds reproduce the "nothing aligned" sentinels (f_len, -1).
            f_start = min([f_len, *aligned_f])
            f_end = max([-1, *aligned_f])
            phrase_pairs.update(extract(alignment, e_start, e_end, f_start, f_end, f_len))
    return phrase_pairs


def extract(alignment: Alignment, e_start: int, e_end: int, f_start: int, f_end: int, f_len: int) -> OrderedDict:
    """Return the phrase pairs for one e span and its minimal f span.

    The result is empty when the e span has no aligned word (``f_end == -1``)
    or when a word inside the f span is aligned to an e word outside the e
    span. Otherwise the f span is also widened over neighbouring f words that
    have no alignment point at all, and every such widening is returned.
    """
    extracted_phrase_pairs = OrderedDict()
    if f_end == -1:
        return extracted_phrase_pairs
    for e_pos, f_pos in alignment:
        outside_e_span = e_pos < e_start or e_end < e_pos
        if outside_e_span and f_start <= f_pos <= f_end:
            return extracted_phrase_pairs
    # Built once so the expansion loops below do not rescan the alignment.
    aligned_f_positions = {f_pos for _, f_pos in alignment}
    f_stretch_s = f_start
    while True:
        f_stretch_e = f_end
        while True:
            pair = PhrasePair(
                e_phrase_span=(e_start, e_end),
                f_phrase_span=(f_stretch_s, f_stretch_e))
            extracted_phrase_pairs[pair] = None
            f_stretch_e += 1
            if f_stretch_e >= f_len or f_stretch_e in aligned_f_positions:
                break
        f_stretch_s -= 1
        if f_stretch_s < 0 or f_stretch_s in aligned_f_positions:
            break
    return extracted_phrase_pairs


def f_in_alignment(sought_f_pos: int, alignment: Alignment):
    """Return True if any alignment point uses f position ``sought_f_pos``."""
    return any(f_pos == sought_f_pos for _, f_pos in alignment)


def render_phrase_pairs(phrase_pairs: OrderedDict, e_sentence: List[str], f_sentence: List[str], render_type: str, alignment: Alignment) -> None:
    """Show each phrase pair as a grid and/or as text, pausing after each one.

    Args:
        phrase_pairs: Mapping whose keys are the ``PhrasePair`` objects to show.
        e_sentence: Tokens of the e sentence (grid rows).
        f_sentence: Tokens of the f sentence (grid columns).
        render_type: ``"image"``, ``"text"`` or ``"both"``.
        alignment: Alignment points, drawn as dotted cells inside the grid.
    """
    show_image = render_type in ("image", "both")
    show_text = render_type in ("text", "both")
    for pair in phrase_pairs:
        if show_image:
            print_word_grid(f_sentence, e_sentence, list_cells_in_phrase_pair(pair), alignment)
        if show_text:
            e_span, f_span = pair
            print("E: ", *e_sentence[e_span[0]:e_span[1] + 1])
            print("F: ", *f_sentence[f_span[0]:f_span[1] + 1])
        print()
        input("Press any key...")


def list_cells_in_phrase_pair(phrase_pair: PhrasePair) -> List[Tuple[int, int]]:
    """Return every ``(e_pos, f_pos)`` cell inside the phrase pair's rectangle."""
    e_first, e_last = phrase_pair.e_phrase_span
    f_first, f_last = phrase_pair.f_phrase_span
    return [
        (e_pos, f_pos)
        for e_pos in range(e_first, e_last + 1)
        for f_pos in range(f_first, f_last + 1)
    ]


def load_alignments(e_file: str, f_file: str, align_file: str) -> Iterator[Tuple[Sentence, Sentence, Alignment]]:
    """Yield ``(e_sentence, f_sentence, alignment)`` triples read in lockstep.

    Iteration stops at the end of the shortest of the three files.
    """
    yield from zip(sents_in_file(e_file), sents_in_file(f_file), alignments_in_file(align_file))


def alignments_in_file(source_file_loc: str) -> Iterator[Alignment]:
    """Yield one alignment per line of ``source_file_loc``.

    Each line holds whitespace-separated integers read as consecutive
    ``(e_pos, f_pos)`` pairs; a blank line yields an empty alignment.

    Raises:
        ValueError: If a token is not an integer or a line holds an odd
            number of positions.
    """
    with open(source_file_loc, "r", encoding="utf-8") as source_file:
        for line_no, line in enumerate(source_file, start=1):
            positions = [int(token) for token in line.split()]
            if len(positions) % 2:
                raise ValueError(
                    f"{source_file_loc}:{line_no}: expected an even number of "
                    f"alignment positions, got {len(positions)}"
                )
            yield list(zip(positions[0::2], positions[1::2]))


def sents_in_file(source_file_loc: str) -> Iterator[List[str]]:
    """Yield each line of ``source_file_loc`` as a list of whitespace-separated tokens."""
    # Explicit UTF-8: the sentences contain non-ASCII characters and the
    # platform default encoding is not UTF-8 everywhere.
    with open(source_file_loc, "r", encoding="utf-8") as source_file:
        for line in source_file:
            yield line.split()


if __name__ == "__main__":
    main()
+NULL If one reads the word " forest " in the sentence , it disturbs the beautiful reading flow . +NULL How did you , Mr Chess Master , come to make this unorthodox move ? +NULL With your technique nothing comes off the plate .