Initial commit

This commit is contained in:
Daniel Ledda
2020-07-20 21:32:24 +02:00
parent 6bc44dbd53
commit 7b7d41199a
5 changed files with 286 additions and 0 deletions

View File

@@ -1,2 +1,66 @@
# koehn_alignments
Visualisation of the results of the alignment algorithm in Philipp Koehn's SMT book.
## Usage
Put the alignments, tokenised e sentences, and tokenised f sentences into the files align, e, and f, respectively (or choose your own names and enter these into the command line.)
There are examples of the format required included.
By default, each sentence is read, the alignment is printed to the console on a grid, and each extracted phrase pair is then shown in turn, both as a highlighted grid and as text.
```
usage: alignments.py [-h] [-r RENDER_TYPE] [-s] [-e E_FILE] [-f F_FILE] [-a ALIGN_FILE]
optional arguments:
-h, --help show this help message and exit
-r RENDER_TYPE, --render_type RENDER_TYPE
How to render the phrases. Choose from 'text', 'image', or 'both'. Default: 'both'
-s, --hide_alignment Turn off the initial render of the alignment.
-e E_FILE, --e_file E_FILE
Location of the file containing the translated sentences, separated by newlines. Default: './e'
-f F_FILE, --f_file F_FILE
Location of the file containing the foreign sentences, separated by newlines. Default: './f'
-a ALIGN_FILE, --align_file ALIGN_FILE
Location of the file containing the word alignments, one sentence pair per line. Default: './align'
```
Sample output:
```
m
i b
c d l
h g a d h e
a e v a a a i
e h o u s e i u b
l t n s , s r m s t
░░░░░░░ ░░░░░░░ ░░░░░░░ ░░░░░░░ ░░░░░░░
michael ░░ ░░ ░░░░░░░ ░░░░░░░ ░░░░░░░ ░░░░░░░
░░░░░░░ ░░░░░░░ ░░░░░░░ ░░░░░░░ ░░░░░░░
███████████████████████████████████ ░░░░░░░ ░░░░░░░
assumes ██ ████ ████ ████████████████ ░░░░░░░ ░░░░░░░
███████████████████████████████████ ░░░░░░░ ░░░░░░░
░░░░░░░███████████████████████████████████░░░░░░░ ░░░░░░░
that ░░░░░░░██████████████████████████████ ██░░░░░░░ ░░░░░░░
░░░░░░░███████████████████████████████████░░░░░░░ ░░░░░░░
░░░░░░░ ░░░░░░░ ░░░░░░░ ░░░░░░░ ░░░░░░░
he ░░░░░░░ ░░░░░░░ ░░░░░░░ ░░░ ░░░░░░░ ░░░░░░░
░░░░░░░ ░░░░░░░ ░░░░░░░ ░░░░░░░ ░░░░░░░
░░░░░░░ ░░░░░░░ ░░░░░░░ ░░░░░░░ ░░░░░░░
will ░░░░░░░ ░░░░░░░ ░░░░░░░ ░░░░░░░ ░░░░░░░ ░░░
░░░░░░░ ░░░░░░░ ░░░░░░░ ░░░░░░░ ░░░░░░░
░░░░░░░ ░░░░░░░ ░░░░░░░ ░░░░░░░ ░░░░░░░
stay ░░░░░░░ ░░░░░░░ ░░░░░░░ ░░░░░░░ ░░ ░░
░░░░░░░ ░░░░░░░ ░░░░░░░ ░░░░░░░ ░░░░░░░
░░░░░░░ ░░░░░░░ ░░░░░░░ ░░░░░░░ ░░░░░░░
in ░░░░░░░ ░░░░░░░ ░░░░░░░ ░░░░░░░ ░░░ ░░░░░░░
░░░░░░░ ░░░░░░░ ░░░░░░░ ░░░░░░░ ░░░░░░░
░░░░░░░ ░░░░░░░ ░░░░░░░ ░░░░░░░ ░░░░░░░
the ░░░░░░░ ░░░░░░░ ░░░░░░░ ░░ ░░ ░░░░░░░
░░░░░░░ ░░░░░░░ ░░░░░░░ ░░░░░░░ ░░░░░░░
░░░░░░░ ░░░░░░░ ░░░░░░░ ░░░░░░░ ░░░░░░░
house ░░░░░░░ ░░░░░░░ ░░░░░░░ ░░░░░░░ ░░ ░░
░░░░░░░ ░░░░░░░ ░░░░░░░ ░░░░░░░ ░░░░░░░
E: assumes that
F: geht davon aus , dass
```

6
align Normal file
View File

@@ -0,0 +1,6 @@
0 0 1 1 1 2 1 3 2 5 3 6 4 9 5 9 6 7 7 7 8 8
0 0 1 1 2 2 3 4 4 5 5 6 6 7 7 8 8 11 9 12 9 13 10 10 11 9 12 14 13 15 14 15 15 3 16 16
0 0 1 2 2 3 3 4 4 1 5 2 6 5 7 6 8 7 9 8 10 9 11 10 12 11 12 12 13 13
0 0 1 1 2 2 3 0 4 0 5 0 6 9 6 10 7 11 8 4 9 5 10 6 11 7 12 8 13 3 14 12 15 14 16 13 17 15 18 16 19 17 19 18 20 19
0 0 1 1 2 2 3 3 4 4 5 5 6 6 6 7 7 8 8 9 8 10 8 11 9 12 10 13 11 14 12 9 12 10 12 11 13 15
0 0 1 1 2 2 3 3 4 5 5 4 6 4 7 6 7 7 8 8 9 6 10 9

204
alignments.py Normal file
View File

@@ -0,0 +1,204 @@
from typing import List, Tuple, NamedTuple, Iterator
from collections import OrderedDict
import argparse
# One sentence pair's word alignment: a list of (e_position, f_position)
# token-index pairs.
Alignment = List[Tuple[int, int]]

# A tokenised sentence.
Sentence = List[str]


class PhrasePair(NamedTuple):
    """A phrase pair, stored as one token span per language.

    Each span is a ``(first, last)`` pair of token indices; both ends are
    inclusive.
    """

    # Span of the phrase within the e sentence.
    e_phrase_span: Tuple[int, int]
    # Span of the phrase within the f sentence.
    f_phrase_span: Tuple[int, int]
# Glyphs used to draw one grid cell, keyed first by outline style and then by
# colour ("hili" = highlighted, "grey"/"blnk" = the two checkerboard shades).
# "filled" glyphs are solid; "dotted" glyphs have a gap or mark in the middle.
# Every glyph must be exactly the same width: cells are printed back to back,
# so a glyph of a different width shifts every cell to its right and the grid
# no longer lines up with the column headers.
BOX_DRAWING = {
    "dotted": {
        "hili": "██   ██",
        "grey": "░░   ░░",
        "blnk": "  ░░░  ",
    },
    "filled": {
        "hili": "███████",
        "grey": "░░░░░░░",
        "blnk": "       ",
    },
}

# Width, in characters, of a single grid cell (and of each header column).
CELL_WIDTH = len(BOX_DRAWING["dotted"]["hili"])
def main(argv=None):
    """Command-line entry point.

    Parses the options, then walks through the sentence pairs read from the
    e, f and alignment files. For each pair the alignment grid is printed
    (unless ``--hide_alignment`` is given) and every extracted phrase pair is
    rendered according to ``--render_type``.

    Args:
        argv: Optional list of argument strings. When ``None`` (the default),
            argparse reads the arguments from ``sys.argv``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-r",
        "--render_type",
        type=str,
        default="both",
        # Reject unknown values up front; otherwise neither render branch
        # matches and nothing is drawn, with no hint as to why.
        choices=("text", "image", "both"),
        metavar="RENDER_TYPE",
        help="How to render the phrases. Choose from 'text', 'image', or 'both'. Default: 'both'"
    )
    parser.add_argument(
        "-s",
        "--hide_alignment",
        action="store_true",
        help="Turn off the initial render of the alignment."
    )
    parser.add_argument(
        "-e",
        "--e_file",
        type=str,
        default="./e",
        help="Location of the file containing the translated sentences, separated by newlines. Default: './e'"
    )
    parser.add_argument(
        "-f",
        "--f_file",
        type=str,
        default="./f",
        help="Location of the file containing the foreign sentences, separated by newlines. Default: './f'"
    )
    parser.add_argument(
        "-a",
        "--align_file",
        type=str,
        default="./align",
        help="Location of the file containing the word alignments, one sentence pair per line. Default: './align'"
    )
    args = parser.parse_args(argv)

    for e_sent, f_sent, alignment in load_alignments(args.e_file, args.f_file, args.align_file):
        if not args.hide_alignment:
            # f words label the columns, e words label the rows.
            print_word_grid(f_sent, e_sent, alignment)
            input("Press any key...")
        phrase_pairs = find_all_phrase_pairs_in_alignment(alignment, len(e_sent), len(f_sent))
        render_phrase_pairs(
            phrase_pairs,
            e_sentence=e_sent,
            f_sentence=f_sent,
            render_type=args.render_type,
            alignment=alignment)
def print_word_grid(sent_x: List[str], sent_y: List[str], highlighted_cells: List[Tuple[int, int]], dotted_cells=None):
    """Print a grid with one column per word of sent_x and one row per word of sent_y.

    The words of ``sent_x`` are printed as column headers first. Each grid row
    is three text lines tall; the row's word is printed to the left of the
    middle line.

    Args:
        sent_x: Words labelling the columns.
        sent_y: Words labelling the rows.
        highlighted_cells: ``(row, column)`` index pairs to draw highlighted.
        dotted_cells: ``(row, column)`` index pairs to draw with the dotted
            glyph. Only the middle text line of a cell is drawn dotted.
            Defaults to no dotted cells.
    """
    if dotted_cells is None:
        dotted_cells = []
    # default=0 lets an empty sentence (e.g. from a blank input line) through
    # instead of raising ValueError from max() on an empty sequence.
    max_e_len = max((len(word) for word in sent_y), default=0)
    max_f_len = max((len(word) for word in sent_x), default=0)
    print_grid_headers(max_e_len, max_f_len, sent_x)
    for y_ord, row_word in enumerate(sent_y):
        for row in range(3):
            if row == 1:
                left_column_text = row_word.rjust(max_e_len, " ")
                row_dotted_cells = dotted_cells
            else:
                left_column_text = " " * max_e_len
                row_dotted_cells = []
            print(left_column_text, end=" ")
            for x_ord in range(len(sent_x)):
                print_appropriate_cell(x_ord, y_ord, highlighted_cells, row_dotted_cells)
            print()
def print_appropriate_cell(x_ord, y_ord, highlighted_cells, dotted_cells):
    """Print the glyph for the cell at column x_ord, row y_ord, without a newline.

    The cell is looked up as ``(y_ord, x_ord)``. Highlighted cells use the
    "hili" colour; all others alternate between "grey" and "blnk" in a
    checkerboard pattern. Cells listed in ``dotted_cells`` use the dotted
    glyph set, the rest the filled one.
    """
    cell = (y_ord, x_ord)
    if cell in highlighted_cells:
        shade = "hili"
    else:
        # Same parity of row and column -> grey, otherwise blank.
        shade = "grey" if (y_ord - x_ord) % 2 == 0 else "blnk"
    outline = "dotted" if cell in dotted_cells else "filled"
    print(BOX_DRAWING[outline][shade], end="")
def print_grid_headers(max_e_len, max_f_len, sent_f, width=CELL_WIDTH):
    """Print the column headers: each word of sent_f written vertically.

    One text line is printed per character position, ``max_f_len`` lines in
    total. Words are bottom-aligned, so shorter words get blank cells on the
    upper lines. Every line starts with ``max_e_len`` spaces plus one
    separator space, matching the left-hand label column of the grid.

    Args:
        max_e_len: Width of the left-hand label column.
        max_f_len: Number of header lines to print.
        sent_f: Words to print, one per column.
        width: Width of one column; each character is centred within it.
    """
    margin = " " * max_e_len
    empty_cell = " " * width
    # Count positions from the end of each word, furthest first.
    for offset_from_end in range(max_f_len, 0, -1):
        cells = [
            word[-offset_from_end].center(width, " ")
            if offset_from_end <= len(word)
            else empty_cell
            for word in sent_f
        ]
        print(margin, "".join(cells))
def find_all_phrase_pairs_in_alignment(alignment: Alignment, e_len: int, f_len: int) -> OrderedDict:
    """Collect the phrase pairs extracted for every e span of the sentence.

    For each inclusive e span ``(e_start, e_end)``, the smallest f span
    covering all f positions aligned to that e span is computed and handed to
    ``extract``. If nothing in the e span is aligned, ``extract`` receives
    ``f_start == f_len`` and ``f_end == -1``.

    Args:
        alignment: ``(e_position, f_position)`` pairs.
        e_len: Number of tokens in the e sentence.
        f_len: Number of tokens in the f sentence.

    Returns:
        An OrderedDict whose keys are the phrase pairs, in the order they
        were first found; all values are ``None``.
    """
    found: OrderedDict = OrderedDict()
    for e_start in range(e_len):
        for e_end in range(e_start, e_len):
            covered_f = [
                f_pos
                for e_pos, f_pos in alignment
                if e_start <= e_pos <= e_end
            ]
            # The sentinels keep min/max defined when the span is unaligned.
            f_start = min([f_len] + covered_f)
            f_end = max([-1] + covered_f)
            found.update(extract(alignment, e_start, e_end, f_start, f_end, f_len))
    return found
def extract(alignment: Alignment, e_start: int, e_end: int, f_start: int, f_end: int, f_len: int) -> OrderedDict:
    """Return the phrase pairs for one e span and its minimal f span.

    Nothing is returned when ``f_end`` is -1, or when some alignment point
    has its f position inside ``[f_start, f_end]`` but its e position outside
    ``[e_start, e_end]``. Otherwise the f span is also widened over adjacent
    f positions that appear in no alignment point, and one phrase pair is
    produced per combination of start and end.

    Returns:
        An OrderedDict keyed by PhrasePair with ``None`` values. Keys are
        ordered by f start (moving left from ``f_start``) and, within one
        start, by f end (moving right from ``f_end``).
    """
    pairs = OrderedDict()
    if f_end == -1:
        return pairs

    crosses_span = any(
        f_start <= f_pos <= f_end and not (e_start <= e_pos <= e_end)
        for e_pos, f_pos in alignment
    )
    if crosses_span:
        return pairs

    aligned_f = {f_pos for _, f_pos in alignment}

    # Candidate starts: f_start, then each unaligned position to its left.
    f_starts = [f_start]
    while f_starts[-1] - 1 >= 0 and f_starts[-1] - 1 not in aligned_f:
        f_starts.append(f_starts[-1] - 1)

    # Candidate ends: f_end, then each unaligned position to its right.
    f_ends = [f_end]
    while f_ends[-1] + 1 < f_len and f_ends[-1] + 1 not in aligned_f:
        f_ends.append(f_ends[-1] + 1)

    for stretch_start in f_starts:
        for stretch_end in f_ends:
            pair = PhrasePair(
                e_phrase_span=(e_start, e_end),
                f_phrase_span=(stretch_start, stretch_end))
            pairs[pair] = None
    return pairs
def f_in_alignment(sought_f_pos: int, alignment: Alignment):
    """Return True if any alignment point has sought_f_pos as its f position."""
    return any(f_pos == sought_f_pos for _, f_pos in alignment)
def render_phrase_pairs(phrase_pairs: OrderedDict, e_sentence: List[str], f_sentence: List[str], render_type: str, alignment: Alignment) -> None:
    """Show each phrase pair in turn, pausing for input after every one.

    Args:
        phrase_pairs: Mapping whose keys are the phrase pairs to show.
        e_sentence: Tokens of the e sentence (grid rows).
        f_sentence: Tokens of the f sentence (grid columns).
        render_type: "image" draws the grid, "text" prints the phrase words,
            "both" does both.
        alignment: Alignment points, passed to the grid as its dotted cells.
    """
    draw_grid = render_type in ("image", "both")
    write_text = render_type in ("text", "both")
    for pair in phrase_pairs:
        if draw_grid:
            # The phrase pair's cells are highlighted; alignment points are dotted.
            print_word_grid(f_sentence, e_sentence, list_cells_in_phrase_pair(pair), alignment)
        if write_text:
            e_span, f_span = pair
            e_words = [e_sentence[i] for i in range(e_span[0], e_span[1] + 1)]
            f_words = [f_sentence[i] for i in range(f_span[0], f_span[1] + 1)]
            print("E: ", *e_words)
            print("F: ", *f_words)
        print()
        input("Press any key...")
def list_cells_in_phrase_pair(phrase_pair: PhrasePair) -> List[Tuple[int, int]]:
    """Return every (e_position, f_position) cell covered by a phrase pair.

    Both spans are treated as inclusive. Cells are listed row by row: all f
    positions for the first e position, then for the next, and so on.
    """
    e_span = phrase_pair.e_phrase_span
    f_span = phrase_pair.f_phrase_span
    return [
        (e_pos, f_pos)
        for e_pos in range(e_span[0], e_span[1] + 1)
        for f_pos in range(f_span[0], f_span[1] + 1)
    ]
def load_alignments(e_file: str, f_file: str, align_file: str) -> Iterator[Tuple[Sentence, Sentence, Alignment]]:
    """Yield (e_sentence, f_sentence, alignment) triples read line by line.

    The three files are read in lockstep; iteration stops as soon as the
    shortest of them is exhausted.
    """
    e_sentences = sents_in_file(e_file)
    f_sentences = sents_in_file(f_file)
    alignments = alignments_in_file(align_file)
    yield from zip(e_sentences, f_sentences, alignments)
def alignments_in_file(source_file_loc: str) -> Iterator[Alignment]:
    """Yield one alignment per line of the file.

    Each line holds whitespace-separated integers that are read two at a
    time as ``(e_position, f_position)`` pairs. A blank line yields an empty
    alignment.

    Args:
        source_file_loc: Path of the alignment file.

    Raises:
        ValueError: If a line holds an odd number of values, or a value is
            not an integer.
    """
    with open(source_file_loc, 'r', encoding="utf-8") as source_file:
        for line_no, line in enumerate(source_file, start=1):
            positions = line.split()
            if len(positions) % 2 != 0:
                # Report the offending line instead of failing later with a
                # bare IndexError on the missing second half of a pair.
                raise ValueError(
                    f"{source_file_loc}, line {line_no}: expected an even "
                    f"number of positions, got {len(positions)}"
                )
            yield [
                (int(e_pos), int(f_pos))
                for e_pos, f_pos in zip(positions[::2], positions[1::2])
            ]
def sents_in_file(source_file_loc: str) -> Iterator[List[str]]:
    """Yield each line of the file as a list of whitespace-separated tokens.

    A blank line yields an empty list.

    Args:
        source_file_loc: Path of the sentence file, read as UTF-8.
    """
    # Decode explicitly as UTF-8: the sentences contain non-ASCII characters,
    # and the platform default encoding is not UTF-8 everywhere.
    with open(source_file_loc, 'r', encoding="utf-8") as source_file:
        for line in source_file:
            yield line.strip().split()
# Run the command-line interface only when executed as a script, not on import.
if __name__ == "__main__":
    main()

6
e Normal file
View File

@@ -0,0 +1,6 @@
michael assumes that he will stay in the house
NULL Sie dürfen der schönen Morticia , die zum Sterben schön ist , beim Tanzen zuschauen .
NULL Mach das Fenster bitte zu , es gibt einen heftigen Durchzug gerade .
NULL Wenn man an der Stelle im Satz das Wort " Wald " liest , stört es den schönen Lesefluss .
NULL Wie sind Sie , Herr Schachmeister , auf diesen unorthodoxen Zug gekommen ?
NULL Mit deiner Technik kommt gar nichts vom Teller runter .

6
f Normal file
View File

@@ -0,0 +1,6 @@
michael geht davon aus , dass er im haus bleibt
NULL You can watch the beautiful Morticia , who is beautiful to die for , dancing .
NULL Please close the window , there is a violent draft right now .
NULL If one reads the word " forest " in the sentence , it disturbs the beautiful reading flow .
NULL How did you , Mr Chess Master , come to make this unorthodox move ?
NULL With your technique nothing comes off the plate .