aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorarie <arie@alleycat.cc>2021-05-26 12:32:19 +0200
committerarie <arie@alleycat.cc>2021-06-28 21:01:35 +0200
commit1d297b2741751c6a63cfffe5a51016d90c49db2c (patch)
tree240048ac2106068e050a04c4975597a55836421f
parent105a7eff0ccffa9b0df07d0615b7b75314934e3a (diff)
Rdf-turtle:
Implement more parsers for the turtle type. Work in progress. A lot of bugs will probably show up during testing, but with tests in place it is easier to fix them, so writing the tests first is the best next step. See the comments about the mutually recursive parsers. Hopefully that works ...
-rw-r--r--lib/turtle/rdf_turtle.ml363
-rw-r--r--lib/turtle/rdf_turtle.mli207
2 files changed, 401 insertions, 169 deletions
diff --git a/lib/turtle/rdf_turtle.ml b/lib/turtle/rdf_turtle.ml
index 83c7aea..df468b3 100644
--- a/lib/turtle/rdf_turtle.ml
+++ b/lib/turtle/rdf_turtle.ml
@@ -13,91 +13,94 @@ end
module SMap = Map.Make (Ordered_string)
-module Iriref = struct
- type t = string
- let of_string s = s
-end
-
-module Language = struct
- type t = string
- let of_string s = s
-end
-
-module Prefixed_name = struct
- type t = string * string
- let of_strings s1 s2 = (s1, s2)
-end
-
-module Blank_node = struct
- type t = string
- let of_string s = s
-end
-
-module Iri = struct
- type t = Iriref of Iriref.t | Prefixed_name of Prefixed_name.t
- let of_iriref ref = Iriref ref
- let of_prefixed_name pname = Prefixed_name pname
-end
-
-module Literal = struct
- type t = {
- value: string;
- language: string option;
- datatype: Iri.t;
+module AST = struct
+
+ module Iriref = struct
+ type t = string
+ let of_string s = s
+ end
+
+ module Language = struct
+ type t = string
+ let of_string s = s
+ end
+
+ module Prefixed_name = struct
+ type t = string * string
+ let of_strings s1 s2 = (s1, s2)
+ end
+
+ module Blank_node = struct
+ type t = string
+ let of_string s = s
+ end
+
+ module Iri = struct
+ type t = Iriref of Iriref.t | Prefixed_name of Prefixed_name.t
+ let of_iriref ref = Iriref ref
+ let of_prefixed_name pname = Prefixed_name pname
+ end
+
+ module Literal = struct
+ type t = {
+ value: string;
+ language: string option;
+ datatype: Iri.t;
+ }
+
+ let make value ?language datatype =
+ { value; datatype; language }
+
+ let canonical literal =
+ literal.value
+
+ let datatype literal =
+ literal.datatype
+
+ let language literal =
+ literal.language
+ end
+
+ module Predicate = struct
+ type t = Pred_iri of Iri.t | Pred_a
+ let of_iri iri = Pred_iri iri
+ end
+
+ type object' =
+ Obj_iri of Iri.t
+ | Obj_blank_node of Blank_node.t
+ | Obj_literal of Literal.t
+ | Obj_coll of collection
+ | Obj_BnodPs of bnodep list
+ and collection =
+ Collection of object' list
+ and subject =
+ Sub_iri of Iri.t
+ | Sub_blank_node of Blank_node.t
+ | Sub_coll of collection
+ and bnodep = BNodeP of predobjs
+ and predobjs = (Predicate.t * object' list) list
+
+ type triples =
+ SubjPredObjs of subject * predobjs
+ | BNodePs of (bnodep list)
+
+ type directive = PrefixID of string * Iriref.t | Base of Iriref.t
+
+ type statement = Directive of directive | Triples of triples
+
+ type turtle = statement list
+
+ type parser_state = {
+ base_uri : Iri.t;
+ namespaces : Iri.t SMap.t;
+ bnode_labels : Blank_node.t SMap.t;
+ cur_subject: subject;
+ cur_predicate: Predicate.t;
}
- let make value ?language datatype =
- { value; datatype; language }
-
- let canonical literal =
- literal.value
-
- let datatype literal =
- literal.datatype
-
- let language literal =
- literal.language
end
-module Predicate = struct
- type t = Pred_iri of Iri.t | Pred_a
- let of_iri iri = Pred_iri iri
-end
-
-type object' =
- Obj_iri of Iri.t
- | Obj_blank_node of Blank_node.t
- | Obj_literal of Literal.t
- | Obj_coll of collection
- | Obj_BnodPs of bnodep list
-and collection =
- Collection of object' list
-and subject =
- Sub_iri of Iri.t
- | Sub_blank_node of Blank_node.t
- | Sub_coll of collection
-and bnodep = BNodeP of predobj
-and predobj = Predicate.t * object' list
-
-type triples =
- SubjPredObjs of (subject * predobj list)
- | BNodePs of (bnodep list)
-
-type directive = PrefixID of string * Iriref.t | Base of Iriref.t
-
-type statement = Directive of directive | Triples of triples
-
-type turtle = statement list
-
-type parser_state = {
- base_uri : Iri.t;
- namespaces : Iri.t SMap.t;
- bnode_labels : Blank_node.t SMap.t;
- cur_subject: subject;
- cur_predicate: Predicate.t;
-}
-
-
module Parser = struct
open Angstrom
@@ -127,44 +130,226 @@ module Parser = struct
let iriref =
lift
- Iriref.of_string
+ AST.Iriref.of_string
(delimiters '<' '>')
let prefixed_name =
lift2
- Prefixed_name.of_strings
+ AST.Prefixed_name.of_strings
(take_while (char_is_not_equal_to ':')
<* char ':')
(take_while is_not_whitespace)
let language =
lift
- Language.of_string
+ AST.Language.of_string
(char '@'
*> take_while is_not_whitespace)
let blank_node =
lift
- Blank_node.of_string
+ AST.Blank_node.of_string
(char '_'
*> char ':'
*> take_while is_not_whitespace)
let iri =
- (lift Iri.of_iriref iriref)
+ (lift AST.Iri.of_iriref iriref)
<|>
- (lift Iri.of_prefixed_name prefixed_name)
+ (lift AST.Iri.of_prefixed_name prefixed_name)
let literal =
lift3
(fun value lang_opt iri -> match lang_opt with
- | "" -> Literal.make value iri
- | lang -> Literal.make value ~language:lang iri)
+ | "" -> AST.Literal.make value iri
+ | lang -> AST.Literal.make value ~language:lang iri)
(delimiters '"' '"')
(char '@'
*> (take_while (char_is_not_equal_to ':'))
)
(string "^^" *> iri)
+ let predicate =
+ lift AST.Predicate.of_iri iri
+
+ let object_ collection bnodep =
+ choice [
+ (lift
+ (fun iri -> AST.Obj_iri iri)
+ iri);
+ (lift
+ (fun blank_node -> AST.Obj_blank_node blank_node)
+ blank_node
+ );
+ (lift
+ (fun literal -> AST.Obj_literal literal)
+ literal
+ );
+ (lift
+ (fun collection -> AST.Obj_coll collection)
+ collection
+ );
+ (lift
+ (fun bnodeps -> AST.Obj_BnodPs bnodeps)
+ (many bnodep)
+ )
+ ]
+
+ let collection_ bnodep =
+ fix (fun collection ->
+ let object' = object_ collection bnodep in
+ (lift
+ (fun collection -> AST.Collection collection)
+ (char '('
+ *>
+ (many object')
+ <* char ')'
+ )
+ )
+ )
+
+ let subject_ bnodep =
+ let collection = collection_ bnodep in
+ choice [
+ (lift
+ (fun iri -> AST.Sub_iri iri)
+ (iri)
+ );
+ (lift
+ (fun blank_node -> AST.Sub_blank_node blank_node)
+ (blank_node)
+ );
+ (lift
+ (fun collection -> AST.Sub_coll collection)
+ (collection)
+ )
+ ]
+
+ let bnodep_ predobjs =
+ lift
+ (fun predobjs -> AST.BNodeP predobjs)
+ ( char '['
+ *> predobjs
+ <* char ']'
+ )
+
+ let predobjs =
+ let semicolon =
+ whitespace
+ *> char ';'
+ <* whitespace
+ in
+ let comma =
+ whitespace
+ *> char ','
+ <* whitespace
+ in
+ fix (fun predobjs ->
+ let bnodep = bnodep_ predobjs in
+ let collection = collection_ bnodep in
+ let object' = object_ collection bnodep in
+ sep_by1 semicolon (
+ lift2
+ (fun p objs -> (p, objs))
+ predicate
+ (sep_by1 comma object')
+ )
+ )
+
+ let bnodep = bnodep_ predobjs
+ let subject = subject_ bnodep
+ let collection = collection_ bnodep
+ let object' = object_ collection bnodep
+
+ let triples =
+ choice [
+ (lift2
+ (fun subject predobjs -> AST.SubjPredObjs (subject, predobjs))
+ subject
+ predobjs);
+ (lift
+ (fun bnodeps -> AST.BNodePs bnodeps)
+ (many bnodep)
+ )
+ ]
+
+ let directive =
+ choice [
+ lift2
+ (fun str iriref -> AST.PrefixID (str, iriref))
+ (string "@prefix"
+ *> whitespace
+ *> (take_while (char_is_not_equal_to ':'))
+ <* char ':'
+ <* whitespace)
+ (iriref
+ <* whitespace
+ <* char '.')
+ ;
+ lift
+ (fun iriref -> AST.Base iriref)
+ (string "@base"
+ *> iriref
+ <* whitespace
+ <* char '.'
+ )
+ ]
+
+ let statement =
+ choice [
+ lift
+ (fun directive -> AST.Directive directive)
+ (directive)
+ ;
+ lift
+ (fun triples -> AST.Triples triples)
+ (triples
+ <* whitespace
+ <* char '.'
+ )
+ ]
+
+ let turtle =
+ many statement
+
+
+ (*
+ This is the idea for mutually recursive parsers (needed because Angstrom doesn't have a 'fix_poly'). The 'fix' function allows recursion, but the parser being defined can only refer to itself, not to other parsers. That's why we have
+let a b c d = ...
+We just make a function a that depends on b, c and d, and later we can compute a once the parsers b, c and d are available.
+In principle, you can have as many parsers as you like, but it becomes rather lengthy at some point.
+ Note that a plays two different roles here, namely that of the function that sends the parsers b, c and d to the parser a, and that of the parser a itself.
+In the implementation, we chose a_ (or for example object_) as the name of the function.
+Because some of the types do not depend on all the other types, our functions are simpler (but it is harder to see the general pattern).
+
+ let a b c d = fix (fun a -> (* definition of `a` in terms of `a` and `b` and `c` and `d`*))
+
+ let b c d =
+ fix (fun b ->
+ let a = a b c d in
+ (* definition of b in terms of `a` and `b` and `c` and `d`*)
+ )
+ let c d =
+ fix (fun c ->
+ let b = b c d in
+ let a = a b c d in
+ (* definition of c in terms of `a` and `b` and `c` and `d`*)
+ )
+
+ let d =
+ fix (fun d ->
+ let c = c d in
+ let b = b c d in
+ let a = a b c d in
+ (* definition of d in terms of `a` and `b` and `c` and `d`*)
+ )
+
+ let c = c d
+
+ let b = b c d
+
+ let a = a b c d
+ *)
+
end
diff --git a/lib/turtle/rdf_turtle.mli b/lib/turtle/rdf_turtle.mli
index 4c4ccff..941f039 100644
--- a/lib/turtle/rdf_turtle.mli
+++ b/lib/turtle/rdf_turtle.mli
@@ -8,107 +8,154 @@
module SMap : Map.S with type key = string
-module Iriref : sig
-(* we take a string for an iriref *)
- type t = string
- val of_string : string -> t
-end
+module AST : sig
-module Language : sig
- type t = string
- val of_string : string -> t
-end
+ module Iriref : sig
-(* The prefixed name is given by a PNAME_NS and PNAME_LN *)
-(* This is the namespace and the local part *)
-module Prefixed_name : sig
- type t = string * string
- val of_strings : string -> string -> t
-end
+ (* we take a string for an iriref *)
+ type t = string
-module Blank_node : sig
- type t = string
- val of_string : string -> t
-end
+ val of_string : string -> t
-module Iri : sig
- type t = Iriref of Iriref.t | Prefixed_name of Prefixed_name.t
- val of_iriref : Iriref.t -> t
- val of_prefixed_name : Prefixed_name.t -> t
-end
+ end
+
+ module Language : sig
+
+ type t = string
+
+ val of_string : string -> t
+
+ end
+
+ (* The prefixed name is given by a PNAME_NS and PNAME_LN *)
+ (* This is the namespace and the local part *)
+ module Prefixed_name : sig
+
+ type t = string * string
+
+ val of_strings : string -> string -> t
+
+ end
+
+ module Blank_node : sig
+
+ type t = string
+
+ val of_string : string -> t
+
+ end
+
+ module Iri : sig
+
+ type t = Iriref of Iriref.t | Prefixed_name of Prefixed_name.t
+
+ val of_iriref : Iriref.t -> t
+
+ val of_prefixed_name : Prefixed_name.t -> t
+
+ end
(* The language option indicates a language, the iri option inidicates the data type *)
-module Literal : sig
- type t
- (** A literal. *)
+(* Question: if the datatype is absent from the literal, the default is xsd:string. *)
+(* So we could decide to always store a datatype (i.e. not use an Iri option). Zoggy uses *)
+(* an Iri option though, maybe because that is closer to the exact Turtle specification? *)
- val make : string -> ?language:string -> Iri.t -> t
- (** Create a new literal. *)
+ module Literal : sig
+ type t
+ (** A literal. *)
- val canonical : t -> string
- (** [canonical literal] returns the canonical representation of the literal. *)
+ val make : string -> ?language:string -> Iri.t -> t
+ (** Create a new literal. *)
- val datatype : t -> Iri.t
- (** [datatype literal] returns the datatype of the literal. *)
+ val canonical : t -> string
+ (** [canonical literal] returns the canonical representation of the literal. *)
- val language : t -> string option
- (** [language literal] returns the language tag of the literal. *)
-end
+ val datatype : t -> Iri.t
+ (** [datatype literal] returns the datatype of the literal. *)
-(* We omit the 'verb' type, since this predicate type encompasses the same space *)
-module Predicate : sig
- type t = Pred_iri of Iri.t | Pred_a
- val of_iri : Iri.t -> t
-end
+ val language : t -> string option
+ (** [language literal] returns the language tag of the literal. *)
+ end
+
+ (* We omit the 'verb' type, since this predicate type encompasses the same space *)
+ module Predicate : sig
+
+ type t = Pred_iri of Iri.t | Pred_a
+
+ val of_iri : Iri.t -> t
+
+ end
+
+ (* The types below are mutually recursive; the 'and' keyword links their definitions. *)
+ type object' =
+ Obj_iri of Iri.t
+ | Obj_blank_node of Blank_node.t
+ | Obj_literal of Literal.t
+ | Obj_coll of collection
+ | Obj_BnodPs of bnodep list
+ and collection =
+ Collection of object' list
+ and subject =
+ Sub_iri of Iri.t
+ | Sub_blank_node of Blank_node.t
+ | Sub_coll of collection
+ and bnodep = BNodeP of predobjs
+ and predobjs = (Predicate.t * object' list) list
+
+ type triples =
+ SubjPredObjs of subject * predobjs
+ | BNodePs of (bnodep list)
+
+ type directive = PrefixID of string * Iriref.t | Base of Iriref.t
+
+ type statement = Directive of directive | Triples of triples
+
+ type turtle = statement list
+
+ type parser_state = {
+ base_uri : Iri.t;
+ namespaces : Iri.t SMap.t;
+ bnode_labels : Blank_node.t SMap.t;
+ cur_subject: subject;
+ cur_predicate: Predicate.t;
+ }
-(* The types below are (recursive, what is it called?). The 'and' indicates this. *)
-type object' =
- Obj_iri of Iri.t
- | Obj_blank_node of Blank_node.t
- | Obj_literal of Literal.t
- | Obj_coll of collection
- | Obj_BnodPs of bnodep list
-and collection =
- Collection of object' list
-and subject =
- Sub_iri of Iri.t
- | Sub_blank_node of Blank_node.t
- | Sub_coll of collection
-and bnodep = BNodeP of predobj
-and predobj = Predicate.t * object' list
-
-type triples =
- SubjPredObjs of (subject * predobj list)
- | BNodePs of (bnodep list)
-
-type directive = PrefixID of string * Iriref.t | Base of Iriref.t
-
-type statement = Directive of directive | Triples of triples
-
-type turtle = statement list
-
-type parser_state = {
- base_uri : Iri.t;
- namespaces : Iri.t SMap.t;
- bnode_labels : Blank_node.t SMap.t;
- cur_subject: subject;
- cur_predicate: Predicate.t;
-}
+end
module Parser : sig
val whitespace : unit Angstrom.t
- val iriref : Iriref.t Angstrom.t
+ val iriref : AST.Iriref.t Angstrom.t
+
+ val prefixed_name : AST.Prefixed_name.t Angstrom.t
+
+ val language : AST.Language.t Angstrom.t
+
+ val blank_node : AST.Blank_node.t Angstrom.t
+
+ val iri : AST.Iri.t Angstrom.t
+
+ val literal : AST.Literal.t Angstrom.t
+
+ val predicate : AST.Predicate.t Angstrom.t
+
+ val predobjs : AST.predobjs Angstrom.t
+
+ val bnodep : AST.bnodep Angstrom.t
+
+ val subject : AST.subject Angstrom.t
+
+ val collection : AST.collection Angstrom.t
- val prefixed_name : Prefixed_name.t Angstrom.t
+ val object' : AST.object' Angstrom.t
- val language : Language.t Angstrom.t
+ val triples : AST.triples Angstrom.t
- val blank_node : Blank_node.t Angstrom.t
+ val directive : AST.directive Angstrom.t
- val iri : Iri.t Angstrom.t
+ val statement : AST.statement Angstrom.t
- val literal : Literal.t Angstrom.t
+ val turtle : AST.turtle Angstrom.t
end