from typing import Iterable, Optional

from rex.utils.io import load_jsonlines
def _require(condition, message):
    """Raise ``AssertionError(message)`` when *condition* is falsy.

    Used instead of the ``assert`` statement so the checks still run when
    Python is started with ``-O``/``-OO`` (which strips ``assert``), while
    callers that catch ``AssertionError`` keep working.
    """
    if not condition:
        raise AssertionError(message)


def _is_list_of(value, item_type):
    """Return True if *value* is a list whose items are all *item_type*."""
    return isinstance(value, list) and all(
        isinstance(item, item_type) for item in value
    )


def _span_matches(text, mention):
    """Return True if ``text[start:end]`` equals ``mention["text"]``.

    ``start`` and ``end`` are the first two items of ``mention["span"]``.
    """
    start, end = mention["span"][0], mention["span"][1]
    return text[start:end] == mention["text"]


def _check_schema(schema):
    """Validate the label lists under each key of the ``schema`` dict."""
    for key in schema:
        if key in ("cls", "ent", "rel"):
            _require(
                _is_list_of(schema[key], str),
                f"schema[{key!r}] must be a list of str",
            )
        elif key == "event":
            _require(
                isinstance(schema[key], dict),
                "schema['event'] must be a dict",
            )
            for event_type in schema[key]:
                _require(
                    _is_list_of(schema[key][event_type], str),
                    f"schema['event'][{event_type!r}] must be a list of str",
                )
        else:
            raise AssertionError(f"unknown schema key: {key!r}")


def _check_cls_answers(instance, answers):
    """Validate ``ans["cls"]``: a list of str."""
    _require(_is_list_of(answers, str), "ans['cls'] must be a list of str")


def _check_ent_answers(instance, answers):
    """Validate ``ans["ent"]``: typed mentions whose span matches the text."""
    _require(_is_list_of(answers, dict), "ans['ent'] must be a list of dict")
    for ent in answers:
        _require(
            isinstance(ent["type"], str)
            and ent["type"] in instance["schema"]["ent"],
            "entity type must be a str listed in schema['ent']",
        )
        _require(
            isinstance(ent["text"], str)
            and _span_matches(instance["text"], ent),
            "entity text must be a str equal to the text slice at its span",
        )
        span = ent["span"]
        _require(
            isinstance(span, list)
            and len(span) == 2
            and all(isinstance(x, int) for x in span),
            "entity span must be a list of two ints",
        )


def _check_rel_answers(instance, answers):
    """Validate ``ans["rel"]``: a known relation with head and tail mentions."""
    _require(_is_list_of(answers, dict), "ans['rel'] must be a list of dict")
    for rel in answers:
        _require(
            isinstance(rel["relation"], str)
            and rel["relation"] in instance["schema"]["rel"],
            "relation must be a str listed in schema['rel']",
        )
        for side in ("head", "tail"):
            _require(
                isinstance(rel[side], dict)
                and _span_matches(instance["text"], rel[side]),
                f"relation {side} must be a dict whose text matches its span",
            )


def _check_event_answers(instance, answers):
    """Validate ``ans["event"]``: a known type, a trigger and its arguments."""
    _require(_is_list_of(answers, dict), "ans['event'] must be a list of dict")
    for event in answers:
        _require(
            event["event_type"] in instance["schema"]["event"],
            "event_type must be a key of schema['event']",
        )
        trigger = event["trigger"]
        _require(
            isinstance(trigger, dict)
            and trigger["text"] in instance["text"]
            and _span_matches(instance["text"], trigger),
            "event trigger must be a dict whose text matches its span",
        )
        for arg in event["args"]:
            _require(
                arg["role"] in instance["schema"]["event"][event["event_type"]],
                "argument role must be listed in the schema for its event type",
            )
            _require(
                isinstance(arg["text"], str)
                and _span_matches(instance["text"], arg),
                "argument text must be a str equal to the text slice at its span",
            )


def _check_span_answers(instance, answers):
    """Validate ``ans["span"]``: mentions whose span matches the text."""
    _require(_is_list_of(answers, dict), "ans['span'] must be a list of dict")
    for span in answers:
        _require(
            isinstance(span["text"], str)
            and _span_matches(instance["text"], span),
            "span text must be a str equal to the text slice at its span",
        )


# Maps each permitted key of ``ans`` to the helper that validates its value.
_ANSWER_CHECKERS = {
    "cls": _check_cls_answers,
    "ent": _check_ent_answers,
    "rel": _check_rel_answers,
    "event": _check_event_answers,
    "span": _check_span_answers,
}


def check_udi_instance(instance: dict):
    """Validate the structure of one instance dict.

    The checks performed, in order:

    * ``id`` and ``instruction`` are str; ``schema`` is a dict whose keys are
      among ``cls``/``ent``/``rel`` (lists of str) and ``event`` (a dict
      mapping each event type to a list of str).
    * ``ans`` is a dict whose keys are among ``cls``/``ent``/``rel``/
      ``event``/``span``; every answer label must appear in the schema and
      every mention's ``text`` must equal ``instance["text"][start:end]``
      for its ``span``.
    * ``text`` and ``bg`` are str; ``text`` is non-empty when the schema has
      a non-empty ``ent``/``rel``/``event`` entry or ``ans`` has ``span``.
    * At least one of ``instruction``, ``text`` and ``bg`` is non-empty.

    Args:
        instance: The instance to validate.

    Returns:
        None when every check passes.

    Raises:
        AssertionError: A check failed. Raised explicitly rather than through
            ``assert`` so validation is not stripped under ``python -O``.
        KeyError: A required key is missing.
        TypeError: A value that gets subscripted or sliced has an
            unsupported type.
    """
    _require(isinstance(instance["id"], str), "`id` must be a str")
    _require(
        isinstance(instance["instruction"], str), "`instruction` must be a str"
    )

    schema = instance["schema"]
    _require(isinstance(schema, dict), "`schema` must be a dict")
    _check_schema(schema)

    ans = instance["ans"]
    _require(isinstance(ans, dict), "`ans` must be a dict")
    for key in ans:
        checker = _ANSWER_CHECKERS.get(key)
        _require(checker is not None, f"unknown ans key: {key!r}")
        checker(instance, ans[key])

    # ``text`` is type-checked only after the answers, so a missing ``text``
    # key surfaces as KeyError from whichever access reaches it first.
    text = instance["text"]
    _require(isinstance(text, str), "`text` must be a str")
    _require(isinstance(instance["bg"], str), "`bg` must be a str")

    # Extraction-style tasks need a text to extract from.
    needs_text = "span" in ans or any(
        schema.get(key) for key in ("ent", "rel", "event")
    )
    if needs_text:
        _require(len(text) > 0, "`text` must be non-empty for extraction tasks")

    _require(
        instance["instruction"] or text or instance["bg"],
        "`instruction`, `text` and `bg` must not all be empty",
    )
def is_valid_udi_instance(instance: dict) -> bool:
    """Return True if ``check_udi_instance`` accepts *instance*, else False.

    Any ``Exception`` raised by the check is treated as "invalid".
    ``KeyboardInterrupt`` and ``SystemExit`` are deliberately not caught, so
    a long validation run can still be interrupted.
    """
    try:
        check_udi_instance(instance)
    except Exception:
        return False
    return True
def main(filepaths: Optional[Iterable[str]] = None):
    """Print the path of every file that contains an invalid instance.

    Each path is loaded with ``load_jsonlines`` and its instances are checked
    with ``is_valid_udi_instance``; checking of a file stops at its first
    invalid instance.

    Args:
        filepaths: Paths of the files to check. Defaults to no files, so
            calling ``main()`` without arguments does nothing.
    """
    for filepath in filepaths or ():
        instances = load_jsonlines(filepath)
        # all() short-circuits on the first invalid instance.
        if not all(is_valid_udi_instance(ins) for ins in instances):
            print(filepath)
# Script entry point: run the validation pass only on direct execution.
if __name__ == "__main__":
    main()