0ad/binaries/data/mods/official/language/convert.pl

use strict;
use warnings;

use Spreadsheet::ParseExcel;
use Encode;

opendir my $root, '.' or die "Error opening current directory: $!";
for my $d (grep { /^[a-z]+$/ and -d $_ } readdir $root) {
	opendir my $dir, $d or die "Error opening directory '$d': $!";
	for (grep /\.xls$/, readdir $dir) {
		/(.*)\.xls$/;
		print "* Converting $d/$1.xls\n";
		convert("$d/$1");
	}
}
print "* Completed\n\n";

sub convert {

	my $filename_in = "$_[0].xls";
	my $filename_base = $_[0];

	$_[0] =~ /([^\/]+)$/;
	my $table_name = $1;

	my $workbook = Spreadsheet::ParseExcel::Workbook->Parse($filename_in);
	die unless $workbook;
	my $worksheet = $workbook->{Worksheet}[0];

	my @data;

	my %encodings = (
		# Excel name => Encode name
		ucs2 => 'ucs2-be',
		ascii => 'ascii',
	);

	for my $r ($worksheet->{MinRow} .. $worksheet->{MaxRow}) {
		push @data, [];
		for my $c ($worksheet->{MinCol} .. $worksheet->{MaxCol}) {
			my $cell = $worksheet->{Cells}[$r][$c];
			if ($cell) {
				my $code = defined $cell->{Code} ? $cell->{Code} : 'ascii';
				if (not exists $encodings{$code}) {
					die "Unrecognised encoding '$code'";
				}
				push @{$data[-1]}, decode($encodings{$code}, $cell->{Val});
			} else {
				push @{$data[-1]}, '';
			}
		}
	}

	if (@data < 3) {
		die "Too few rows of data";
	}

	# Remove the top line
	shift @data;

	if (1 != grep length, @{$data[0]}) {
		die "Second row must contain a single cell, either 'phrases' or the name of the word-table (e.g. 'nouns')";
	}

	my $title = (shift @data)->[0];

	# Remove blank lines
	while (@data and not grep length, @{$data[0]}) {
		shift @data;
	}

	if ($title eq 'phrases') {
		convert_phrases($filename_base, \@data);
	} else {
		convert_words($filename_base, $title, \@data);
	}
}

sub convert_words {

	my $filename_out = "$_[0].wrd";
	my $title = $_[1];
	my @data = @{$_[2]};

	my @keys = map lc, @{ shift @data };
	# Remove the English name from the list of keys
	shift @keys;

=pod

struct {
	u8 TitleLength;
	u16* Title;

	u8 PropCount;
	struct {
		u8 KeyLength;
		u16* Key;
	}* PropNames;

	u16 ValueCount;
	struct {
		u8 WordLength;
		u16* Word;
		struct {
			u8 PropLength;
			u16* Property;
		} Properties[ProprCount];
	} Words[ValueCount];
}

=cut

	open my $o, '>', $filename_out or die "Error opening $filename_out: $!";
	binmode $o;

	print $o pack 'C/a*', encode('utf16-le', $title);

	print $o pack 'C', scalar @keys;
	print $o pack 'C/a*', encode('utf16-le', $_) for @keys;

	print $o pack 'S', scalar @data;
	for (@data) {
		print $o pack 'C/a*', encode('utf16-le', $_) for @$_[0..@keys]; # 1 more than @keys, because the first 'property' is the English name
	}

	close $o;
}

sub convert_phrases {

	my $filename_out = "$_[0].lng";
	my @data = @{$_[1]};

	# Allow simple error reporting
	our $errors = 0;
	sub error($) { print STDERR "$_[0]\n"; ++$errors; }

	# Split the input file on "--" lines, and store an array ref
	# for each section containing the lines

	my @phrase_lines;
	my $line_no = 3;
	for (@data) {
		push @phrase_lines, [ [ $line_no, $_->[0] ], [ $line_no, $_->[1] ] ];
		++$line_no;
	}

	# Build phrase_data, being a list of [ raw key, parsed key, parsed translation ]

	my @phrase_data;

	for (@phrase_lines) {
		next unless @$_;
		my $key = shift @$_;
		if (not @$_) {
			error "Error in line $key->[0]: no translation specified for key '$key->[1]'";
			next;
		}
		my $translation = join "\n", map $_->[1], @$_;

		push @phrase_data, [ $key->[1], parse_translation($key->[1], $translation) ];
	}

	if ($errors) {
		die "ABORTING: $::errors errors found\n";
	}

	#use Data::Dumper; print Dumper \@phrase_data; exit;

=pod

Disk format:

file {
	u16 phrase_count
	phrase* phrases;
}

phrase {
	u16 key_length;
	u16* key_string; // not null-terminated

	u8 variable_count; // just for validating its use

	u8 section_count;
	translation_section* sections;
}

translation_section {
	u8 type;

	// Type 0: (string)
	u16 length;
	u16* text; // not null-terminated

	// OR Type 1: (variable)
	u8 id; // referring to the position in the key phrase

	// OR Type 2: (function)
	u8 namelength;
	u8* name;
	u8 paramcount;
	func_param* params;
}

func_param {
	u8 type;

	// Type 0: (string)
	u8 length;
	u16* text;

	// OR Type 1: (variable)
	u8 id; // referring to the position in the key phrase

	// OR Type 2: (int)
	u32 value;

	// (doubles/etc will just be stringified)
}

=cut

	# Output the language file to disk

	open my $o, '>', $filename_out or die "Error opening $filename_out: $!";
	binmode $o;

	print $o pack 'S', scalar @phrase_data;
	for (@phrase_data) {
		# $_ eq [ key, var count, [ sections... ] ]

		my $key = encode('utf16-le', $_->[0]);
		print $o pack 'S/a*C', $key, $_->[1];

		print $o pack 'C', scalar @{ $_->[2] };

		for (@{ $_->[2] }) {
			# $_ eq [ type, data... ]

			if ($_->[0] eq 'str') {
				print $o pack 'CS/a*', 0, encode('utf16-le', $_->[1]);

			} elsif ($_->[0] eq 'var') {
				print $o pack 'CC', 1, $_->[1];

			} elsif ($_->[0] eq 'code') {
				my ($name, @params) = @{$_->[1]};
				print $o pack 'CC/a*C', 2, $name, scalar @params;

				for (@params) {

					if ($_->[0] eq 'str') {
						print $o pack 'CC/a*', 0, encode('utf16-le', $_->[1]);

					} elsif ($_->[0] eq 'var') {
						print $o pack 'CC', 1, $_->[1];

					} elsif ($_->[0] eq 'int') {
						print $o pack 'Cl', 2, $_->[1];

					} else {
						die "Invalid func param type $_->[0]";
					}

				}

			} else {
				die "Invalid sec type $_->[0]";
			}
		}
	}
}

sub parse_basic {
	# Parse "\$[a-zA-Z0-9_]+" (allowing "$$" to represent "$")
	my @parts;
	for (split /(\$(?:\$|[a-zA-Z0-9_]+))/, $_[0]) {
		if (/\$([a-zA-Z0-9_]+)/) {
			push @parts, [ var => $1 ];
		} else {
			$_ = '$' if $_ eq '$$';
			if (@parts and $parts[-1][0] eq 'str') {
				$parts[-1][1] .= $_;
			} else {
				push @parts, [ str => $_ ];
			}
		}
	}
	return \@parts;
}

sub parse_complex {
	# Parse "\$[a-zA-Z0-9_]+" (allowing "$$" to represent "$")
	# as well as [...] code lumps

	my @chars = split //, $_[0];
	my @parts = [ str => '' ];

	while (@chars) {
		$_ = shift @chars;
		if ($_ eq '$') {
			if (@chars==0 or $chars[0] eq '$') {
				$parts[-1][1] .= '$';
			} else {
				my $varname = '';
				while (@chars and $chars[0] =~ /[a-zA-Z0-9_]/) {
					$varname .= shift @chars;
				}
				push @parts, [ var => $varname ];
				push @parts, [ str => '' ];
			}
		} elsif ($_ eq '[') {
			my $codetext;
			while (@chars and $chars[0] ne ']') {
				$codetext .= shift @chars;
			}
			shift @chars;
			push @parts, [ code => parse_code($codetext) ];
			push @parts, [ str => '' ];

		} else {
			$parts[-1][1] .= $_;
		}
	}

	@parts = grep $_->[1] ne '', @parts;
	return \@parts;
}


sub parse_code {
	my @rawparts = split /,\s*/, $_[0];
	my $name = shift @rawparts;
	my @parts;
	for (@rawparts) {
		if (/^\d+$/) {
			push @parts, [ 'int' => $_ ];
		} elsif (/^\$(.*)/) {
			push @parts, [ 'var' => $1 ];
		} else {
			push @parts, [ 'str' => $_ ];
		}
	}
	return [ $name, @parts ];
}


sub parse_translation {
	my ($key, $str) = @_;

	# Parse key to extract the variable sections
	$key = parse_basic($key);

	# Fill variables with ( var0 => 0, var1 => 1, var2 => 2 )
	my %variables;
	for (@$key) {
		$variables{$_->[1]} = scalar keys %variables if $_->[0] eq 'var';
	}
	my %unused_variables = %variables;

	# Parse the translated string for
	$str = parse_complex($str);

	# Replace variable strings with the appropriate numbers
	for (@$str) {
		if ($_->[0] eq 'var') {
			if (not exists $variables{$_->[1]}) {
				die "Unrecognised variable '$_->[1]'\n";
			}
			delete $unused_variables{$_->[1]};
			$_->[1] = $variables{$_->[1]};

		} elsif ($_->[0] eq 'code') {

			for (@{$_->[1]}[1 .. $#{$_->[1]}]) {

				if ($_->[0] eq 'var') {
					if (not exists $variables{$_->[1]}) {
						die "Unrecognised variable '$_->[1]'\n";
					}
					delete $unused_variables{$_->[1]};
					$_->[1] = $variables{$_->[1]};
				}
			}
		}
	}
	for (keys %unused_variables) {
		warn "Warning: variable '$_' unreferenced in translated string\n";
	}

	return (scalar keys %variables, $str);
}