Parsing XML
01.01.2007
{ Here I will show one way to parse XML document The main concept of XML is using containers for XML objects - so we will use Tree concept while building our XML object from XML document. XML text uses containers (<TAG ... >...</TAG>) or simple definitions (<TAG ... />) in each TAG we can use parameters (<TAG key1="value1" key2="value2">... <SIMPLE key3="value3"> ...</TAG>) Finally we will have an array of objects, describing XML tags. Every object of this class will have an array of children if needed, and a hash to describe properties of it. For example if we have a text <UL name="xxx"> <LI name="xxx1"/> <LI name="xxx2"/> <LI name="xxx3"/> <LI name="xxx4"/> </UL> we will have one root object (named "UL") in wich we will have 4 children (named "LI" with different sets of properties - from "NAME"="xxx1" to "NAME"="xxx4") This is not a trivial task - so we will make a unit to solve this... I will try to comment some here... if you have any comments for this unit - write to me: sunworx@mail.ru; yz@infoteh.ru } unit YZXMLParser; interface uses SysUtils, ComCtrls; type THashElement = record Key, Value: string; end; type THashElementArr = array of THashElement; // here we declare a THash class to use in our parser // The concept of THash is to retreive named values from an array // Hash is an array where index is a string (example V[Key]=value, // whehe Key and Value are of type string) // The main purpose of this class is to rerurn a value of a String-named key //(example: s:=hash['someValue']) // the description of a hash element we use type THash = class(TObject) private Arr: THashElementArr; function GetValue(Key: string): string; procedure SetValue(Key: string; const VValue: string); function GetKeys: StrArr; function GetValues: StrArr; function GetCount: Integer; function Getempty: Boolean; public property Value[Key: string]: string read GetValue write SetValue; default; property Values: StrArr read GetValues; property Keys: StrArr read GetKeys; property Count: Integer read GetCount; property Empty: Boolean read Getempty; procedure Clear; constructor Create; destructor Destroy; override; end; TYZHash = THash; type // Here we declare some definitions for our parser to know what // identifier we would receive next in our text // these values will be used in the result of WhatNext() function which will scan text for keys TYZXMLMarker = (xmlOpenTag, xmlCloseTagShort, xmlCloseTag, xmlCloseTagLong, xmlEOF, xmlIdentifier, xmlunknown); / * Because we use recursive definition of our class(as TreeView, where we declare children of the same type in opur type declaration) we must use forward declaration * / // The definition of a TAG class TYZXMLTag = class; TYZXMLTags = array of TYZXMLTag; TYZXMLTag = class(TObject) private FData: TYZHash; FParent: TYZXMLTag; FName: string; function GetValue(AName: string): string; procedure SetName(const Value: string); procedure SetValue(AName: string; const Value: string); function GetCount: Integer; function GetValueNames: strarr; public Children: TYZXMLTags; // these are our child nodes Text: string; property Name: string read FName write SetName; // name of a tag property Values[AName: string]: string read GetValue write SetValue; default; // values of properties of a tag (hash values) property ValueNames: strarr read GetValueNames; // array of strings returniong names of all props of this tag property Count: Integer read GetCount; // a count of children of a tag (if this tag is a container) function SkipSpaces(var AData: string; var APos: Integer; RememberBreaks: Boolean = False): Char; // internal. for skip spaces (also CR or LF or other non-text chars) while parsing text function ParseValue(var AData: string; var APos: Integer): Boolean; // parse value (calling when found a parameter of a tag) function ParseName(var AData: string; var APos: Integer): Boolean; // parse key of parameter in a tag // these two procs used to parse any text found while parsing XML function ParseString(var AData: string; var APos: Integer; RememberBreaks: Boolean = False): string; function ParseQuotedString(var AData: string; var APos: Integer; QIndef: Char = '"'): string; // returnes the type of next identifier in XML function WhatNext(var AData: string; var APos: Integer; var ANext: Integer; RememberBreaks: Boolean = False): TYZXMLMarker; // This is a main procedure of our class - AData is a string, // containing all XML data (you can use TMemo.Text, for example, as a parameter of AData) function ParseXML(var AData: string; var APos: Integer): Boolean; // This function returnes a text string, built based on data, stored in an object. function GenerateXML(var AData: string; ATab: string = ''): Boolean; // returnes char from string at specified pos (#0 if not in range) function CharAt(var S: string; APos: Integer): Char; function TagNameExists(AName: string): Boolean; // Adds a child to children array of a current tag function AddChild: TYZXMLTag; // Initializes current tag and deletes all existing children procedure Clear; virtual; constructor Create(AParent: TYZXMLTag); virtual; destructor Destroy; virtual; end; type TYZXMLParser = class(TYZXMLTag) private Header: TYZHash; procedure _BuildTreeView(ATreeView: TTreeView; ANode: TTreeNode; ATag: TYZXMLTag); public property HeaderValues: TYZHash read Header; procedure BuildTreeView(ATreeView: TTreeView); function Parse(AData: string): Boolean; function Generate(var AData: string): Boolean; constructor Create; destructor Destroy; end; implementation //============================================================================== { TYZXMLTag } function TYZXMLTag.AddChild: TYZXMLTag; begin setlength(children, Length(children) + 1); Result := TYZXMLTag.Create(Self); children[Length(children) - 1] := Result; end; //------------------------------------------------------------------------------ procedure TYZXMLTag.Clear; var i: Integer; begin for i := 0 to Count - 1 do if children[i] <> nil then Children[i].Destroy; setlength(children, 0); FData.Clear; Text := ''; end; //------------------------------------------------------------------------------ constructor TYZXMLTag.Create(AParent: TYZXMLTag); begin inherited Create; FData := TYZHash.Create; FParent := AParent; Clear; end; //------------------------------------------------------------------------------ destructor TYZXMLTag.Destroy; begin Clear; FData.Destroy; end; //------------------------------------------------------------------------------ function TYZXMLTag.GetCount: Integer; begin Result := Length(children); end; //------------------------------------------------------------------------------ function TYZXMLTag.GetValue(AName: string): string; begin Result := FData[AName]; end; //------------------------------------------------------------------------------ function TYZXMLTag.ParseName(var AData: string; var APos: Integer): Boolean; begin Result := False; FName := ParseString(AData, APos); if fname = '' then Exit; Result := True; end; //------------------------------------------------------------------------------ function TYZXMLTag.ParseQuotedString(var AData: string; var APos: Integer; QIndef: Char = '"'): string; var i: Integer; skipnext: Boolean; z: Char; begin Result := ''; if CharAt(AData, APos) <> QIndef then Exit; i := apos; skipnext := True; repeat if not skipnext then begin if charat(AData, I) = '\' then SkipNext := True else begin z := charat(AData, I); if (Z = QIndef) or (z = #0) then begin Result := Copy(AData, aPos + 1, I - APos - 1); // result:=exch(result,'\',''); APos := I + 1; Exit; end; end; end else skipnext := False; Inc(i); until False; end; //------------------------------------------------------------------------------ function TYZXMLTag.ParseString(var AData: string; var APos: Integer; RememberBreaks: Boolean = False): string; const extsym: string = '=<>;?*/'; var nxt: Char; x1, x2, i: Integer; begin Result := ''; nxt := SkipSpaces(AData, APos, RememberBreaks); if nxt = #0 then Exit; if (nxt = '"') or (nxt = '''') then begin Result := ParseQuotedString(AData, APos); Exit; end; x1 := APos; i := x1; nxt := CharAt(AData, i); while ((Ord(nxt) <= 32) or (Pos(nxt, extsym) > 0)) and (nxt <> #0) do begin Inc(i); nxt := CharAt(AData, i); end; APos := i; X1 := APos; while (Ord(nxt) > 32) and (Pos(nxt, extsym) <= 0) do begin Inc(i); nxt := CharAt(AData, i); end; x2 := i - x1; Result := Copy(AData, x1, x2); APos := i; end; //------------------------------------------------------------------------------ function TYZXMLTag.ParseValue(var AData: string; var APos: Integer): Boolean; var n, v: string; i, x: Integer; begin Result := False; n := parseString(AData, APos); if n = '' then Exit; if skipspaces(AData, APos) <> '=' then Exit; Inc(apos); V := parseString(AData, APos); fdata[n] := dequote(v); Result := True; end; //------------------------------------------------------------------------------ function TYZXMLTag.ParseXML(var AData: string; var APos: Integer): Boolean; var N: TYZXMLMarker; nxt: Integer; isLong: Boolean; inTag: Boolean; begin isLong := False; Result := False; Clear; if WhatNext(AData, APos, nxt) <> xmlOpenTag then Exit; APos := nxt; if WhatNext(AData, APos, nxt) <> xmlIdentifier then Exit; Result := ParseName(AData, APos); if not Result then Exit; intag := True; Result := False; while True do begin N := WhatNext(AData, APos, nxt, (not intag and islong and (Count > 0))); case N of xmlEOF: Exit; xmlCloseTagLong: begin Result := True; if islong then APos := nxt; if (Text <> '') and (Count > 0) then begin Text := exch(Text, #13#10#13#10, #13#10); end; Exit; end; xmlCloseTagShort: begin Result := (not IsLong) and intag; if Result then APos := nxt; Exit; end; xmlOpenTag: begin if islong then Result := AddChild.ParseXML(AData, APos) else begin Result := False; Exit; end; if not Result then Exit; end; xmlCloseTag: begin IsLong := True; APos := nxt; intag := False; end; xmlIdentifier: begin if intag then parsevalue(AData, APos) else Text := Text + ParseString(AData, APos, True) end; xmlUnknown: begin Result := True; Exit; end; end; end; end; //------------------------------------------------------------------------------ procedure TYZXMLTag.SetName(const Value: string); begin FName := Value; end; //------------------------------------------------------------------------------ procedure TYZXMLTag.SetValue(AName: string; const Value: string); begin FData[AName] := Value; end; //------------------------------------------------------------------------------ function TYZXMLTag.SkipSpaces(var AData: string; var APos: Integer; RememberBreaks: Boolean = False): Char; var L: Integer; P: Char; begin L := Length(AData); while APos <= L do begin P := AData[APos]; if Ord(p) > 32 then begin Result := p; Exit; end else if rememberbreaks then begin if Pos(p, #13#9' ') > 0 then Text := Text + ' '; end; Inc(APos); end; Result := #0; end; //------------------------------------------------------------------------------ function TYZXMLTag.CharAt(var S: string; APos: Integer): Char; begin Result := #0; if (Length(s) < APos) or (apos < 1) then Exit; Result := s[APos]; end; //------------------------------------------------------------------------------ function TYZXMLTag.WhatNext(var AData: string; var APos: Integer; var ANext: Integer; RememberBreaks: Boolean = False): TYZXMLMarker; var s: string; C: Char; P: Integer; begin Result := xmlEOF; P := APos; C := SkipSpaces(AData, APos); P := APos; ANext := P; if C = #0 then Exit; if C = '<' then if CharAt(AData, P + 1) = '/' then begin Inc(P, 2); s := parsestring(AData, P); if (uppercase(s) = uppercase(FName)) and (SkipSpaces(AData, P) = '>') then begin ANext := P + 1; Result := xmlCloseTagLong; Exit; end else begin if TagNameExists(s) then begin Result := xmlCloseTagLong; ANext := APos; Exit; end; ANext := P + 1; Result := xmlCloseTagLong; Exit; end; end; if C = '<' then begin ANext := P + 1; Result := xmlOpenTag; Exit; end; if C = '>' then begin ANext := P + 1; Result := xmlCloseTag; Exit; end; if C = '/' then if CharAt(AData, P + 1) = '>' then begin ANext := P + 2; Result := xmlCloseTagShort; Exit; end; ANext := P; parsestring(AData, ANext); Result := xmlIdentifier; end; //------------------------------------------------------------------------------ function TYZXMLTag.GetValueNames: strarr; begin Result := FData.Keys; end; //------------------------------------------------------------------------------ function TYZXMLTag.GenerateXML(var AData: string; ATab: string = ''): Boolean; var valDelimiter: string; spc: string; i: Integer; a: strarr; begin spc := ATab + #9; if FData.Count < 5 then valDelimiter := ' ' else valDelimiter := #13#10 + spc; AData := AData + #13#10 + ATab + '<' + FName; a := FData.keys; for i := 0 to Length(a) - 1 do begin AData := AData + valDelimiter + a[i] + ' = "' + EnQuote(values[a[i]]) + '"'; end; if (Count > 0) or (Text <> '') then begin AData := AData + '>' + Text; for i := 0 to Count - 1 do begin Children[i].GenerateXML(AData, ATab + #9); end; AData := AData + #13#10 + ATab + '</' + FName + '>'; end else AData := AData + '/>'; Result := True; end; //------------------------------------------------------------------------------ function TYZXMLTag.TagNameExists(AName: string): Boolean; begin Result := AnsiUpperCase(AName) = AnsiUpperCase(Self.FName); if Self.FParent = nil then Exit; if not Result then Result := fparent.TagNameExists(AName); end; //============================================================================== { TYZXMLParser } constructor TYZXMLParser.Create; begin Header := TYZHash.Create; inherited Create(nil); end; //------------------------------------------------------------------------------ destructor TYZXMLParser.Destroy; begin inherited; Header.Destroy; end; //------------------------------------------------------------------------------ procedure TYZXMLParser.BuildTreeView(ATreeView: TTreeView); var i: Integer; begin // clear; ATreeView.Items.Clear; for i := 0 to Count - 1 do _BuildTreeView(ATreeView, nil, children[i]); end; //------------------------------------------------------------------------------ procedure TYZXMLParser._BuildTreeView(ATreeView: TTreeView; ANode: TTreeNode; ATag: TYZXMLTag); var i: Integer; N: TTreeNode; begin N := ATreeView.Items.AddChildObject(ANode, ATag.Name + ' ' + FData['ID'], Pointer(ATag)); for i := 0 to ATag.Count - 1 do begin if ATag.children[i] <> nil then _BuildTreeView(ATreeView, N, ATag.children[i]) else ATreeView.Items.AddChild(N, 'nil'); end; N.Expanded := True; end; //------------------------------------------------------------------------------ function TYZXMLParser.Parse(AData: string): Boolean; var x1, x2, X, i: Integer; s: string; tmp: TYZXMLTag; a: strarr; N: TYZXMLMarker; begin X := 1; Self.SkipSpaces(AData, X); x2 := -1; Result := False; Clear; Header.Clear; x1 := Pos('<?', AData); if x1 >= X then begin x2 := Pos('?>', AData); if x2 < X then Exit; s := uppercase(Copy(AData, x1 + 2, 4)); if Pos('XML ', s) <> 1 then Exit; s := '<xml ' + Copy(AData, x1 + 6, x2 - x1 - 6) + '/>'; tmp := TYZXMLTag.Create(nil); tmp.ParseXML(s, x); a := tmp.ValueNames; for i := 0 to Length(a) - 1 do Header[a[i]] := tmp.Values[a[i]]; tmp.Destroy; x := x2 + 2; end; Result := True; repeat N := whatnext(AData, X, x1); case N of xmlOpenTag: Result := Result and AddChild.ParseXML(AData, X); xmlIdentifier: begin if Text <> '' then Text := Text + ' '; Text := Text + parsestring(AData, X, True); end; else Parsestring(AData, X); end; until skipspaces(adata, x) = #0; // if not result then ShowMessage('Error Parsing: '+inttostr(X)); end; function TYZXMLParser.Generate(var AData: string): Boolean; var i: Integer; a: strarr; begin Header['Date'] := DateTimeToStr(now); a := header.Keys; AData := '<?xml'; for i := 0 to Length(a) - 1 do AData := AData + ' ' + a[i] + '="' + Header[a[i]] + '"'; AData := AData + '?>'#13#10 + Text; Result := True; for i := 0 to Length(children) - 1 do begin Result := Result and children[i].generatexml(AData); end; end; //============================================================================== // procedures of THash class //============================================================================== {THASH CLASS} procedure THash.Clear; begin SetLength(Arr, 0); end; constructor THash.Create; begin inherited; Clear; end; //------------------------------------------------------------------------------ destructor THash.Destroy; begin Clear; inherited; end; //------------------------------------------------------------------------------ function THash.GetCount: Integer; begin Result := Length(Arr); end; //------------------------------------------------------------------------------ function THash.Getempty: Boolean; begin Result := Length(Arr) = 0; end; function THash.GetKeys: StrArr; var i: Integer; begin SetLength(Result, Length(arr)); for i := 0 to Length(Result) - 1 do Result[i] := arr[i].Key; end; //------------------------------------------------------------------------------ function THash.GetValue(Key: string): string; var i: Integer; r: Boolean; begin Result := ''; i := 0; r := False; while (i < Length(Arr)) and (not r) do begin if AnsiUpperCase(arr[i].key) = AnsiUpperCase(Key) then begin Result := Arr[i].Value; r := True; end; i := i + 1; end; end; //------------------------------------------------------------------------------ function THash.GetValues: StrArr; var i: Integer; begin SetLength(Result, Length(arr)); for i := 0 to Length(Result) - 1 do Result[i] := arr[i].Value; end; //------------------------------------------------------------------------------ procedure THash.SetValue(Key: string; const VValue: string); var i, j: Integer; r: Boolean; E: THashElementArr; begin if VValue <> '' then begin i := 0; r := False; while (i < Length(Arr)) and not r do begin if AnsiUpperCase(arr[i].key) = AnsiUpperCase(Key) then begin Arr[i].Value := VValue; r := True; end; i := i + 1; end; if not r then begin SetLength(Arr, Length(arr) + 1); arr[Length(arr) - 1].Key := Key; arr[Length(arr) - 1].Value := Vvalue; end; end; SetLength(E, Length(Arr)); for i := 0 to Length(arr) - 1 do E[i] := Arr[i]; SetLength(arr, 0); for i := 0 to Length(E) - 1 do if (E[i].Key <> '') and (E[i].Value <> '') then begin j := Length(arr); setlength(arr, j + 1); arr[j] := E[i]; end; end; end.