Commit 1ee41a25 authored by Alexandru Dura
Browse files

Earley recognizer is complete.

parent ba08a508
/**
 * An Earley item: a grammar rule, a position inside the rule body (the "dot")
 * and the input position at which recognition of the rule started.
 *
 * <p>Items are used as elements of hash-based sets, so every field that takes
 * part in {@link #equals(Object)} and {@link #hashCode()} is final. Advancing
 * the dot yields a new item instead of mutating the existing one.
 */
public class EarleyItem {
    final int dot; // 0 means before the first element in the rule
    final int start; // 0 means beginning of input
    final EarleyRule rule;

    /**
     * Creates an item with the dot before the first body element.
     *
     * @param rule  the grammar rule this item tracks
     * @param start input position at which the rule was predicted
     */
    public EarleyItem(EarleyRule rule, int start) {
        this(rule, start, 0);
    }

    /** Internal constructor used by {@link #advance()} to place the dot. */
    private EarleyItem(EarleyRule rule, int start, int dot) {
        this.dot = dot;
        this.start = start;
        this.rule = rule;
    }

    /**
     * Returns the symbol immediately after the dot.
     * Must not be called on a complete item.
     */
    public int afterDot() {
        assert !isComplete();
        return rule.body[dot];
    }

    /** Returns true when the dot is past the last element of the rule body. */
    public boolean isComplete() {
        return this.rule.body.length == this.dot;
    }

    /**
     * Returns a new item for the same rule and start position with the dot
     * moved one symbol to the right. Must not be called on a complete item.
     */
    public EarleyItem advance() {
        assert !isComplete();
        return new EarleyItem(rule, start, dot + 1);
    }

    @Override public boolean equals(Object other) {
        if (!(other instanceof EarleyItem))
            return false;
        EarleyItem e = (EarleyItem) other;
        return dot == e.dot && start == e.start
            && rule == e.rule; // reference equality here!
    }

    @Override public int hashCode() {
        return (rule.hashCode() + (dot * 31)) * 31 + start;
    }
}
......@@ -2,8 +2,12 @@ import java.util.ArrayList;
import java.util.HashMap;
import java.util.Map;
import java.util.TreeSet;
import java.util.HashSet;
import java.util.LinkedList;
public class EarleyParser {
private boolean DEBUG = true;
private HashMap<Category, Integer> cat2int;
private HashMap<Integer, Category> int2cat;
......@@ -34,6 +38,10 @@ public class EarleyParser {
}
}
/**
 * Tells whether an integer category id denotes a terminal.
 *
 * @param cat integer id of a category
 * @return true exactly when {@code cat} is negative
 */
static boolean isTerminal(int cat) {
    final boolean negativeId = 0 > cat;
    return negativeId;
}
public void addRule(Rule r) {
ArrayList<Rule> list = grammarRules.get(r.getHead());
if (list == null) {
......@@ -61,23 +69,183 @@ public class EarleyParser {
}
}
/**
 * Renders a rule as {@code head -> sym1 sym2 ... }, with each body symbol
 * followed by a single space. Category names come from {@code int2cat}.
 */
private String asString(EarleyRule rule) {
    StringBuilder text = new StringBuilder();
    text.append(int2cat.get(rule.head).toString()).append(" -> ");
    for (int k = 0; k < rule.body.length; ++k) {
        text.append(int2cat.get(rule.body[k]).toString()).append(" ");
    }
    return text.toString();
}
/**
 * Renders an item as its rule with a bullet marking the dot position,
 * followed by the item's start position in parentheses.
 */
private String asString(EarleyItem item) {
    final int[] body = item.rule.body;
    StringBuilder text = new StringBuilder();
    text.append(int2cat.get(item.rule.head).toString()).append(" -> ");
    int position = 0;
    for (int symbol : body) {
        // The bullet is printed in front of the symbol the dot precedes.
        if (position == item.dot) {
            text.append("\u2022 ");
        }
        text.append(int2cat.get(symbol).toString()).append(" ");
        ++position;
    }
    // A dot past the last symbol is printed at the very end, without a trailing space.
    if (body.length == item.dot) {
        text.append("\u2022");
    }
    text.append("(").append(item.start).append(")");
    return text.toString();
}
/**
 * Renders the grammar, one rule per line, using {@code asString(EarleyRule)}.
 *
 * <p>Each rule is rendered exactly once. Earlier code also formatted the rule
 * inline before appending {@code asString(r)}, which printed every rule twice
 * on the same line.
 *
 * @return the textual form of all rules, each terminated by a newline
 */
@Override
public String toString() {
    StringBuilder s = new StringBuilder();
    // NOTE(review): index 0 is skipped; presumably it is reserved (0 is used as the
    // end-of-input marker in recognize) - confirm against the category numbering.
    for (int i = 1; i < rules.size(); ++i) {
        TreeSet<EarleyRule> rs = rules.get(i);
        for (EarleyRule r : rs) {
            // Rules are bucketed by the integer id of their head.
            assert r.head == i;
            s.append(asString(r));
            s.append("\n");
        }
    }
    return s.toString();
}
public void parse(Category s[]) {
/**
 * A set of Earley items belonging to one input position.
 * Adds nothing to {@link HashSet}; it only gives the element type a short
 * name so that arrays of state sets can be declared.
 */
class StateSet extends HashSet<EarleyItem> {
}
/**
 * Builds the Earley state sets for an input string.
 *
 * <p>State set 0 is seeded with every rule of the start symbol. Each set is
 * then closed under prediction and completion using a worklist, while
 * scanning fills the following set.
 *
 * @param symbols     a zero-terminated array of symbols; the trailing 0 is not
 *                    a terminal (terminals are negative), so it never scans
 *                    and only lets the last real state set be closed
 * @param startSymbol integer id of the start category
 * @return an array of {@code symbols.length + 1} state sets, where entry
 *         {@code i} holds the items valid after reading {@code i} symbols
 */
private StateSet[] internalParse(int[] symbols, int startSymbol) {
    StateSet[] state = new StateSet[symbols.length + 1];
    state[0] = new StateSet();
    for (EarleyRule r : rules.get(startSymbol)) {
        state[0].add(new EarleyItem(r, 0));
    }

    for (int i = 0; i < symbols.length; ++i) {
        StateSet currentSet = state[i];
        StateSet nextSet = new StateSet();
        state[i + 1] = nextSet;

        LinkedList<EarleyItem> worklist = new LinkedList<>(currentSet);
        while (!worklist.isEmpty()) {
            EarleyItem item = worklist.removeFirst();
            if (item.isComplete()) {
                // COMPLETION
                // TODO: we're iterating over items in a parent set here. This is O(n_items).
                // We can improve this by storing the set as a tree set, which would give
                // a complexity of O(log(n_items)) for this iteration and also for insertion.
                Iterable<EarleyItem> parents;
                if (item.start == i) {
                    // An item that starts and completes in the current set (empty rule
                    // body) makes the parent set the very set we insert into. Iterate
                    // over a snapshot to avoid a ConcurrentModificationException.
                    // NOTE(review): parents added to this set after the snapshot are not
                    // advanced over this item; full nullable-rule support needs more work.
                    parents = new ArrayList<EarleyItem>(currentSet);
                } else {
                    parents = state[item.start];
                }
                for (EarleyItem jtem : parents) {
                    if (!jtem.isComplete() && jtem.afterDot() == item.rule.head) {
                        EarleyItem newItem = jtem.advance();
                        if (currentSet.add(newItem)) {
                            worklist.addLast(newItem);
                        }
                    }
                }
            } else if (isTerminal(item.afterDot())) {
                // SCAN: on a match, the advanced item belongs to the next set.
                if (item.afterDot() == symbols[i]) {
                    nextSet.add(item.advance());
                }
            } else {
                // PREDICTION: non-terminal after dot
                for (EarleyRule r : rules.get(item.afterDot())) {
                    EarleyItem newItem = new EarleyItem(r, i);
                    if (currentSet.add(newItem)) {
                        // the item was not existing in the set, add it to the worklist
                        worklist.addLast(newItem);
                    }
                }
            }
        }
    }
    return state;
}
/**
 * Decides whether the input can be derived from the given start symbol.
 *
 * <p>The categories are translated to integer ids, a 0 terminator is
 * appended, and the state sets are built by {@code internalParse}. The input
 * is accepted when the state set after the last input symbol contains a
 * complete item for the start symbol that began at position 0.
 *
 * <p>When {@code DEBUG} is set, the complete items of every state set are
 * printed to standard output, followed by a separator line.
 *
 * @param s           the input, one category per token
 * @param startSymbol the category to derive the input from
 * @return true if the input is recognized
 * @throws NullPointerException if {@code startSymbol} or an element of
 *         {@code s} has no entry in {@code cat2int} (unboxing of null)
 */
public boolean recognize(Category s[], Category startSymbol) {
    int start = cat2int.get(startSymbol);

    int[] symbols = new int[s.length + 1];
    for (int i = 0; i < s.length; ++i)
        symbols[i] = cat2int.get(s[i]);
    symbols[s.length] = 0; // terminator expected by internalParse

    // Yields s.length + 2 sets; only indices 0..s.length are inspected below.
    StateSet[] state = internalParse(symbols, start);

    if (DEBUG) {
        for (int i = 0; i < s.length + 1; ++i) {
            System.out.println("=== Item set at position " + i + " ===");
            for (EarleyItem item : state[i]) {
                if (item.isComplete())
                    System.out.println(asString(item));
            }
        }
        System.out.println("===========================");
    }

    StateSet finalState = state[s.length];
    for (EarleyItem item : finalState) {
        if (item.isComplete() && item.start == 0 && item.rule.head == start) {
            return true;
        }
    }
    return false;
}
}
public class EarleyRule implements Comparable<EarleyRule> {
int body[];
int head;
final int body[];
final int head;
public EarleyRule(int head, int body[]) {
this.head = head;
this.body = body;
......
......@@ -3,15 +3,19 @@ import static org.junit.Assert.*;
import org.junit.Test;
public class EarleyParserTest {
@Test public void testToString() {
Category num = new Category("NUM", true);
Category var = new Category("VAR", true);
Category plus = new Category("+", true);
Category times = new Category("*", true);
Category num = new Category("NUM", true);
Category var = new Category("VAR", true);
Category plus = new Category("+", true);
Category times = new Category("*", true);
Category metaNum = new Category("META_NUM", true);
Category metaVar = new Category("META_VAR", true);
Category s = new Category("s", false);
Category p = new Category("p", false);
Category t = new Category("t", false);
Category s = new Category("s", false);
Category p = new Category("p", false);
Category t = new Category("t", false);
EarleyParser makeParser() {
EarleyParser parser = new EarleyParser();
parser.addCategory(num);
......@@ -31,9 +35,59 @@ public class EarleyParserTest {
parser.done();
System.out.println(parser);
return parser;
}
/** Checks the textual form of the expression grammar built by makeParser. */
@Test public void testToString() {
    final EarleyParser grammar = makeParser();
    System.out.println(grammar);
    final String expected =
        "s -> s <+> p \ns -> p \np -> p <*> t \np -> t \nt -> <VAR> \nt -> <NUM> \n";
    assertEquals(expected, grammar.toString());
}
/** The token sequence NUM + VAR must be recognized from start symbol s. */
@Test public void testParse1() {
    final EarleyParser grammar = makeParser();
    final Category[] input = new Category[] {num, plus, var};
    final boolean accepted = grammar.recognize(input, s);
    assertTrue(accepted);
}
/** The token sequence NUM + + must be rejected from start symbol s. */
@Test public void testParse2() {
    final EarleyParser grammar = makeParser();
    final Category[] input = new Category[] {num, plus, plus};
    final boolean accepted = grammar.recognize(input, s);
    assertFalse(accepted);
}
/**
 * Builds a parser for the expression grammar extended with META_VAR, which
 * may stand directly for an s, a p or a t. A single META_VAR token therefore
 * has several derivations from s.
 */
EarleyParser makeAmbiguousParser() {
    EarleyParser ambiguous = new EarleyParser();

    // Categories are registered in this fixed order: terminals first, then non-terminals.
    final Category[] categories = {num, var, plus, times, metaVar, s, p, t};
    for (Category category : categories) {
        ambiguous.addCategory(category);
    }

    // Ordinary expression rules.
    ambiguous.addRule(new Rule(s, s, plus, p));
    ambiguous.addRule(new Rule(s, p));
    ambiguous.addRule(new Rule(p, p, times, t));
    ambiguous.addRule(new Rule(p, t));
    ambiguous.addRule(new Rule(t, num));
    ambiguous.addRule(new Rule(t, var));

    // META_VAR is accepted at every level, which makes the grammar ambiguous.
    ambiguous.addRule(new Rule(t, metaVar));
    ambiguous.addRule(new Rule(p, metaVar));
    ambiguous.addRule(new Rule(s, metaVar));

    ambiguous.done();
    return ambiguous;
}
/** META_VAR + META_VAR must be recognized by the ambiguous grammar. */
@Test public void testParse3() {
    final EarleyParser grammar = makeAmbiguousParser();
    final Category[] input = new Category[] {metaVar, plus, metaVar};
    final boolean accepted = grammar.recognize(input, s);
    assertTrue(accepted);
}
}
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment