Initial commit
This commit is contained in:
64
README.md
64
README.md
@@ -1,2 +1,66 @@
|
|||||||
# koehn_alignments
|
# koehn_alignments
|
||||||
Visualisation of the results of the alignment algorithm in Philipp Koehn's SMT book.
|
Visualisation of the results of the alignment algorithm in Philipp Koehn's SMT book.
|
||||||
|
|
||||||
|
## Usage
|
||||||
|
|
||||||
|
Put the alignments, the tokenised e sentences, and the tokenised f sentences into files named `align`, `e`, and `f`, respectively (or choose your own file names and pass them on the command line with `-a`, `-e`, and `-f`).
|
||||||
|
There are examples of the format required included.
|
||||||
|
|
||||||
|
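By default, each sentence pair is read, its alignment is printed to the console on a grid, and each phrase pair extracted from that alignment is then shown in turn, both highlighted on the grid and as text. Press Enter to step from one rendering to the next.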
|
||||||
|
|
||||||
|
```
|
||||||
|
usage: alignments.py [-h] [-r RENDER_TYPE] [-s] [-e E_FILE] [-f F_FILE] [-a ALIGN_FILE]
|
||||||
|
|
||||||
|
optional arguments:
|
||||||
|
-h, --help show this help message and exit
|
||||||
|
-r RENDER_TYPE, --render_type RENDER_TYPE
|
||||||
|
How to render the phrases. Choose from 'text', 'image', or 'both'. Default: 'both'
|
||||||
|
-s, --hide_alignment Turn off the initial render of the alignment.
|
||||||
|
-e E_FILE, --e_file E_FILE
|
||||||
|
Location of the file containing the translated sentences, separated by newlines. Default: './e'
|
||||||
|
-f F_FILE, --f_file F_FILE
|
||||||
|
Location of the file containing the foreign sentences, separated by newlines. Default: './f'
|
||||||
|
-a ALIGN_FILE, --align_file ALIGN_FILE
|
||||||
|
Location of the file containing the alignments, one sentence pair per line. Default: './align'
|
||||||
|
```
|
||||||
|
|
||||||
|
Sample output:
|
||||||
|
|
||||||
|
```
           m
           i                                                              b
           c             d                                                l
           h      g      a                    d                    h      e
           a      e      v      a             a                    a      i
           e      h      o      u             s      e      i      u      b
           l      t      n      s      ,      s      r      m      s      t
        ░░░░░░░       ░░░░░░░       ░░░░░░░       ░░░░░░░       ░░░░░░░
michael ░░   ░░       ░░░░░░░       ░░░░░░░       ░░░░░░░       ░░░░░░░
        ░░░░░░░       ░░░░░░░       ░░░░░░░       ░░░░░░░       ░░░░░░░
               ███████████████████████████████████       ░░░░░░░       ░░░░░░░
assumes        ██   ████   ████   ████████████████       ░░░░░░░       ░░░░░░░
               ███████████████████████████████████       ░░░░░░░       ░░░░░░░
        ░░░░░░░███████████████████████████████████░░░░░░░       ░░░░░░░
   that ░░░░░░░██████████████████████████████   ██░░░░░░░       ░░░░░░░
        ░░░░░░░███████████████████████████████████░░░░░░░       ░░░░░░░
               ░░░░░░░       ░░░░░░░       ░░░░░░░       ░░░░░░░       ░░░░░░░
     he        ░░░░░░░       ░░░░░░░       ░░░░░░░  ░░░  ░░░░░░░       ░░░░░░░
               ░░░░░░░       ░░░░░░░       ░░░░░░░       ░░░░░░░       ░░░░░░░
        ░░░░░░░       ░░░░░░░       ░░░░░░░       ░░░░░░░       ░░░░░░░
   will ░░░░░░░       ░░░░░░░       ░░░░░░░       ░░░░░░░       ░░░░░░░  ░░░
        ░░░░░░░       ░░░░░░░       ░░░░░░░       ░░░░░░░       ░░░░░░░
               ░░░░░░░       ░░░░░░░       ░░░░░░░       ░░░░░░░       ░░░░░░░
   stay        ░░░░░░░       ░░░░░░░       ░░░░░░░       ░░░░░░░       ░░   ░░
               ░░░░░░░       ░░░░░░░       ░░░░░░░       ░░░░░░░       ░░░░░░░
        ░░░░░░░       ░░░░░░░       ░░░░░░░       ░░░░░░░       ░░░░░░░
     in ░░░░░░░       ░░░░░░░       ░░░░░░░       ░░░░░░░  ░░░  ░░░░░░░
        ░░░░░░░       ░░░░░░░       ░░░░░░░       ░░░░░░░       ░░░░░░░
               ░░░░░░░       ░░░░░░░       ░░░░░░░       ░░░░░░░       ░░░░░░░
    the        ░░░░░░░       ░░░░░░░       ░░░░░░░       ░░   ░░       ░░░░░░░
               ░░░░░░░       ░░░░░░░       ░░░░░░░       ░░░░░░░       ░░░░░░░
        ░░░░░░░       ░░░░░░░       ░░░░░░░       ░░░░░░░       ░░░░░░░
  house ░░░░░░░       ░░░░░░░       ░░░░░░░       ░░░░░░░       ░░   ░░
        ░░░░░░░       ░░░░░░░       ░░░░░░░       ░░░░░░░       ░░░░░░░
E:  assumes that
F:  geht davon aus , dass
```
|
||||||
6
align
Normal file
6
align
Normal file
@@ -0,0 +1,6 @@
|
|||||||
|
0 0 1 1 1 2 1 3 2 5 3 6 4 9 5 9 6 7 7 7 8 8
|
||||||
|
0 0 1 1 2 2 3 4 4 5 5 6 6 7 7 8 8 11 9 12 9 13 10 10 11 9 12 14 13 15 14 15 15 3 16 16
|
||||||
|
0 0 1 2 2 3 3 4 4 1 5 2 6 5 7 6 8 7 9 8 10 9 11 10 12 11 12 12 13 13
|
||||||
|
0 0 1 1 2 2 3 0 4 0 5 0 6 9 6 10 7 11 8 4 9 5 10 6 11 7 12 8 13 3 14 12 15 14 16 13 17 15 18 16 19 17 19 18 20 19
|
||||||
|
0 0 1 1 2 2 3 3 4 4 5 5 6 6 6 7 7 8 8 9 8 10 8 11 9 12 10 13 11 14 12 9 12 10 12 11 13 15
|
||||||
|
0 0 1 1 2 2 3 3 4 5 5 4 6 4 7 6 7 7 8 8 9 6 10 9
|
||||||
204
alignments.py
Normal file
204
alignments.py
Normal file
@@ -0,0 +1,204 @@
|
|||||||
|
from typing import List, Tuple, NamedTuple, Iterator
|
||||||
|
from collections import OrderedDict
|
||||||
|
import argparse
|
||||||
|
|
||||||
|
# A word alignment for one sentence pair: a list of (e_position, f_position)
# index pairs, each linking one token of the e sentence to one token of the
# f sentence.
Alignment = List[Tuple[int, int]]
|
||||||
|
# A tokenised sentence: one string per whitespace-separated token.
Sentence = List[str]
|
||||||
|
|
||||||
|
|
||||||
|
class PhrasePair(NamedTuple):
    """A pair of token spans, one in the e sentence and one in the f sentence.

    Both spans are ``(first, last)`` token indices with *both* ends included;
    the rendering code walks them with ``range(first, last + 1)``.
    """

    # (first, last) token indices of the phrase in the e sentence.
    e_phrase_span: Tuple[int, int]
    # (first, last) token indices of the phrase in the f sentence.
    f_phrase_span: Tuple[int, int]
|
||||||
|
|
||||||
|
|
||||||
|
# Glyphs for one text row of one grid cell.
#
# Outer key: "dotted" cells have a hole (or, for blank cells, a small mark) in
# the middle and are used to flag alignment points; "filled" cells are solid.
# Inner key: "hili" = highlighted cell, "grey" / "blnk" = the two shades of the
# checkerboard background.
#
# Every glyph must be exactly the same width: cells are printed back to back
# and the column headers are centred over CELL_WIDTH characters, so a glyph of
# any other width shifts the rest of its row out of alignment.
BOX_DRAWING = {
    "dotted": {
        "hili": "██   ██",
        "grey": "░░   ░░",
        "blnk": "  ░░░  ",
    },
    "filled": {
        "hili": "███████",
        "grey": "░░░░░░░",
        "blnk": "       ",
    },
}
# Width, in characters, of one grid cell (and of one column header).
CELL_WIDTH = len(BOX_DRAWING["dotted"]["hili"])
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Step interactively through every aligned sentence pair in the input files.

    Parses the command line, then for each sentence pair prints the alignment
    grid (unless ``--hide_alignment`` is given) and renders every phrase pair
    extracted from that alignment according to ``--render_type``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-r",
        "--render_type",
        type=str,
        default="both",
        # Reject unknown values up front: previously a typo was accepted and
        # every phrase pair was then "rendered" as nothing but a prompt.
        choices=("text", "image", "both"),
        # Keep the usage line reading "-r RENDER_TYPE" rather than the choices.
        metavar="RENDER_TYPE",
        help="How to render the phrases. Choose from 'text', 'image', or 'both'. Default: 'both'"
    )
    parser.add_argument(
        "-s",
        "--hide_alignment",
        action="store_true",
        help="Turn off the initial render of the alignment."
    )
    parser.add_argument(
        "-e",
        "--e_file",
        type=str,
        default="./e",
        help="Location of the file containing the translated sentences, separated by newlines. Default: './e'"
    )
    parser.add_argument(
        "-f",
        "--f_file",
        type=str,
        default="./f",
        help="Location of the file containing the foreign sentences, separated by newlines. Default: './f'"
    )
    parser.add_argument(
        "-a",
        "--align_file",
        type=str,
        default="./align",
        help="Location of the file containing the alignments, one sentence pair per line. Default: './align'"
    )
    args = parser.parse_args()

    for e_sent, f_sent, alignment in load_alignments(args.e_file, args.f_file, args.align_file):
        if not args.hide_alignment:
            # The f sentence labels the columns and the e sentence the rows.
            print_word_grid(f_sent, e_sent, alignment)
            # NOTE(review): the pause is assumed to belong to this branch
            # (wait only after a grid was actually shown) - confirm.
            input("Press any key...")
        phrase_pairs = find_all_phrase_pairs_in_alignment(alignment, len(e_sent), len(f_sent))
        render_phrase_pairs(
            phrase_pairs,
            e_sentence=e_sent,
            f_sentence=f_sent,
            render_type=args.render_type,
            alignment=alignment)
|
||||||
|
|
||||||
|
|
||||||
|
def print_word_grid(sent_x: List[str], sent_y: List[str], highlighted_cells: List[Tuple[int, int]], dotted_cells=None):
    """Print a grid with the words of ``sent_x`` as columns and ``sent_y`` as rows.

    Each grid cell is three text rows tall. Cells listed in
    ``highlighted_cells`` are drawn highlighted; cells listed in
    ``dotted_cells`` get the dotted glyph on their middle text row only.

    Args:
        sent_x: Tokens labelling the columns (printed vertically as headers).
        sent_y: Tokens labelling the rows (printed right-aligned on the left).
        highlighted_cells: ``(row, column)`` index pairs to highlight.
        dotted_cells: ``(row, column)`` index pairs to mark with a dot, or
            ``None`` for no dots.
    """
    # Build the lookup sets once instead of scanning a list for every cell.
    highlighted = set(highlighted_cells)
    dotted = set(dotted_cells) if dotted_cells is not None else set()
    undotted = frozenset()

    # default=0 keeps an empty sentence (e.g. a blank input line) from
    # crashing max() with "arg is an empty sequence".
    max_e_len = max((len(word) for word in sent_y), default=0)
    max_f_len = max((len(word) for word in sent_x), default=0)

    print_grid_headers(max_e_len, max_f_len, sent_x)
    for y_ord, row_word in enumerate(sent_y):
        for row in range(3):
            is_middle_row = row == 1
            # The row label appears only next to the middle text row.
            left_column_text = row_word.rjust(max_e_len, " ") if is_middle_row else " " * max_e_len
            print(left_column_text, end=" ")
            dots_for_row = dotted if is_middle_row else undotted
            for x_ord in range(len(sent_x)):
                print_appropriate_cell(x_ord, y_ord, highlighted, dots_for_row)
            print()
|
||||||
|
|
||||||
|
|
||||||
|
def print_appropriate_cell(x_ord, y_ord, highlighted_cells, dotted_cells):
    """Print one text row of the cell at column ``x_ord``, row ``y_ord``.

    The glyph is looked up in ``BOX_DRAWING`` and written without a trailing
    newline so that neighbouring cells join up on the same line.
    """
    cell = (y_ord, x_ord)

    if cell in highlighted_cells:
        shade = "hili"
    else:
        # Checkerboard background: cells on even diagonals are grey.
        shade = "blnk" if (y_ord - x_ord) % 2 else "grey"

    pattern = "filled"
    if cell in dotted_cells:
        pattern = "dotted"

    print(BOX_DRAWING[pattern][shade], end="")
|
||||||
|
|
||||||
|
|
||||||
|
def print_grid_headers(max_e_len, max_f_len, sent_f, width=CELL_WIDTH):
    """Print the column headers: each word of ``sent_f`` written top to bottom.

    Words are bottom-aligned, so the last letters of all words share the final
    header line and shorter words are padded with blank cells above.

    Args:
        max_e_len: Width of the row-label column to leave empty on the left.
        max_f_len: Number of header lines (length of the longest column word).
        sent_f: The column words.
        width: Width of one column; each letter is centred within it.
    """
    left_margin = " " * max_e_len
    empty_cell = " " * width
    # Count down the distance from the end of each word: max_f_len, ..., 1.
    for from_end in range(max_f_len, 0, -1):
        print(left_margin, end=" ")
        for token in sent_f:
            if from_end <= len(token):
                print(token[-from_end].center(width, " "), end="")
            else:
                # The word is too short to reach this header line.
                print(empty_cell, end="")
        print()
|
||||||
|
|
||||||
|
|
||||||
|
def find_all_phrase_pairs_in_alignment(alignment: Alignment, e_len: int, f_len: int) -> OrderedDict:
    """Collect the phrase pairs extracted for every contiguous e span.

    For each span ``e_start..e_end`` of the e sentence, the smallest f span
    covering all f positions aligned to it is computed and handed to
    ``extract``.

    Args:
        alignment: The ``(e_position, f_position)`` alignment points.
        e_len: Number of tokens in the e sentence.
        f_len: Number of tokens in the f sentence.

    Returns:
        An ``OrderedDict`` whose keys are the phrase pairs, in the order they
        were first produced; the values come from ``extract``.
    """
    collected: OrderedDict = OrderedDict()
    e_spans = ((first, last) for first in range(e_len) for last in range(first, e_len))
    for e_start, e_end in e_spans:
        covered_f = [f_pos for e_pos, f_pos in alignment if e_start <= e_pos <= e_end]
        # The sentinels f_len and -1 survive only when nothing is aligned to
        # the e span (or, for f_len, when every aligned position is larger).
        f_start = min([f_len, *covered_f])
        f_end = max([-1, *covered_f])
        collected.update(extract(alignment, e_start, e_end, f_start, f_end, f_len))
    return collected
|
||||||
|
|
||||||
|
|
||||||
|
def extract(alignment: Alignment, e_start: int, e_end: int, f_start: int, f_end: int, f_len: int) -> OrderedDict:
    """Return the phrase pairs for one e span and its minimal f span.

    Nothing is returned when ``f_end`` is -1, or when some alignment point
    lies inside ``f_start..f_end`` but outside ``e_start..e_end``. Otherwise
    the minimal f span is emitted together with every widening of it over
    neighbouring f positions that appear in no alignment point, staying
    within ``0..f_len - 1``.

    Returns:
        An ``OrderedDict`` with one ``PhrasePair`` key per extracted pair (all
        values are ``None``), in generation order.
    """
    pairs = OrderedDict()
    if f_end == -1:
        return pairs

    inconsistent = any(
        f_start <= f_pos <= f_end and not e_start <= e_pos <= e_end
        for e_pos, f_pos in alignment
    )
    if inconsistent:
        return pairs

    aligned_f = {f_pos for _, f_pos in alignment}
    e_span = (e_start, e_end)

    left = f_start
    widen_left = True
    while widen_left:
        right = f_end
        widen_right = True
        while widen_right:
            pairs[PhrasePair(e_phrase_span=e_span, f_phrase_span=(left, right))] = None
            right += 1
            # Stop at the sentence end or at the next aligned f position.
            widen_right = right < f_len and right not in aligned_f
        left -= 1
        # Stop at the sentence start or at the previous aligned f position.
        widen_left = left >= 0 and left not in aligned_f
    return pairs
|
||||||
|
|
||||||
|
|
||||||
|
def f_in_alignment(sought_f_pos: int, alignment: Alignment):
    """Return True if any alignment point has ``sought_f_pos`` as its f position."""
    return any(sought_f_pos == f_pos for _, f_pos in alignment)
|
||||||
|
|
||||||
|
|
||||||
|
def render_phrase_pairs(phrase_pairs: OrderedDict, e_sentence: List[str], f_sentence: List[str], render_type: str, alignment: Alignment) -> None:
    """Show each phrase pair in turn, waiting for input after every one.

    Args:
        phrase_pairs: The phrase pairs to show; iterated in order.
        e_sentence: Tokens of the e sentence (grid rows).
        f_sentence: Tokens of the f sentence (grid columns).
        render_type: ``"image"`` draws the grid with the pair highlighted,
            ``"text"`` prints the two phrases, ``"both"`` does both.
        alignment: Alignment points, drawn as dotted cells on the grid.
    """
    draw_grid = render_type in ("image", "both")
    write_text = render_type in ("text", "both")

    for pair in phrase_pairs:
        if draw_grid:
            print_word_grid(f_sentence, e_sentence, list_cells_in_phrase_pair(pair), alignment)
        if write_text:
            (e_first, e_last), (f_first, f_last) = pair
            print("E: ", *(e_sentence[i] for i in range(e_first, e_last + 1)))
            print("F: ", *(f_sentence[i] for i in range(f_first, f_last + 1)))
            # NOTE(review): the blank line is assumed to belong to the text
            # branch and the prompt to the loop body - confirm.
            print()
        input("Press any key...")
|
||||||
|
|
||||||
|
|
||||||
|
def list_cells_in_phrase_pair(phrase_pair: PhrasePair) -> List[Tuple[int, int]]:
    """List every ``(e_position, f_position)`` cell covered by a phrase pair.

    Both spans are inclusive; cells are produced e position by e position,
    with f positions ascending within each.
    """
    e_first, e_last = phrase_pair.e_phrase_span[0], phrase_pair.e_phrase_span[1]
    f_first, f_last = phrase_pair.f_phrase_span[0], phrase_pair.f_phrase_span[1]
    return [
        (e_pos, f_pos)
        for e_pos in range(e_first, e_last + 1)
        for f_pos in range(f_first, f_last + 1)
    ]
|
||||||
|
|
||||||
|
|
||||||
|
def load_alignments(e_file: str, f_file: str, align_file: str) -> Iterator[Tuple[Sentence, Sentence, Alignment]]:
    """Yield ``(e_sentence, f_sentence, alignment)`` for each line of the inputs.

    The three files are read in lockstep, line by line; iteration stops as
    soon as the shortest of them is exhausted.
    """
    e_sentences = sents_in_file(e_file)
    f_sentences = sents_in_file(f_file)
    alignments = alignments_in_file(align_file)
    yield from zip(e_sentences, f_sentences, alignments)
|
||||||
|
|
||||||
|
|
||||||
|
def alignments_in_file(source_file_loc: str) -> Iterator[Alignment]:
    """Yield one alignment per line of the file at ``source_file_loc``.

    Each line holds whitespace-separated integers read as consecutive
    ``e_position f_position`` pairs. A blank line yields an empty alignment.

    Raises:
        ValueError: If a line has an odd number of tokens (a dangling
            position without a partner) or a token is not an integer.
    """
    with open(source_file_loc, 'r') as source_file:
        for line_number, line in enumerate(source_file, start=1):
            positions = line.split()
            if len(positions) % 2:
                # Previously this surfaced as a bare IndexError with no hint
                # of which line was malformed.
                raise ValueError(
                    f"{source_file_loc}, line {line_number}: expected an even number "
                    f"of positions, got {len(positions)}"
                )
            yield [
                (int(e_pos), int(f_pos))
                for e_pos, f_pos in zip(positions[::2], positions[1::2])
            ]
|
||||||
|
|
||||||
|
|
||||||
|
def sents_in_file(source_file_loc: str) -> Iterator[List[str]]:
    """Yield each line of the file at ``source_file_loc`` as a list of tokens.

    Lines are split on whitespace; a blank line yields an empty list.
    """
    # Decode explicitly as UTF-8: the sentences contain non-ASCII characters
    # (e.g. German umlauts), which the platform default encoding may garble.
    with open(source_file_loc, 'r', encoding="utf-8") as source_file:
        for line in source_file:
            yield line.split()
|
||||||
|
|
||||||
|
|
||||||
|
# Run the interactive viewer only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
|
||||||
6
e
Normal file
6
e
Normal file
@@ -0,0 +1,6 @@
|
|||||||
|
michael assumes that he will stay in the house
|
||||||
|
NULL Sie dürfen der schönen Morticia , die zum Sterben schön ist , beim Tanzen zuschauen .
|
||||||
|
NULL Mach das Fenster bitte zu , es gibt einen heftigen Durchzug gerade .
|
||||||
|
NULL Wenn man an der Stelle im Satz das Wort " Wald " liest , stört es den schönen Lesefluss .
|
||||||
|
NULL Wie sind Sie , Herr Schachmeister , auf diesen unorthodoxen Zug gekommen ?
|
||||||
|
NULL Mit deiner Technik kommt gar nichts vom Teller runter .
|
||||||
6
f
Normal file
6
f
Normal file
@@ -0,0 +1,6 @@
|
|||||||
|
michael geht davon aus , dass er im haus bleibt
|
||||||
|
NULL You can watch the beautiful Morticia , who is beautiful to die for , dancing .
|
||||||
|
NULL Please close the window , there is a violent draft right now .
|
||||||
|
NULL If one reads the word " forest " in the sentence , it disturbs the beautiful reading flow .
|
||||||
|
NULL How did you , Mr Chess Master , come to make this unorthodox move ?
|
||||||
|
NULL With your technique nothing comes off the plate .
|
||||||
Reference in New Issue
Block a user