Appendix A

Version 1.0 of the XML DTD describing text files in the ADS Abstract Service.
Document Type Definition for the ADS 
bibliographic records

Syntax policy
=============
 - Element names are in uppercase to improve
   readability.
 - Attribute names are preferably in 
   lowercase.
 - The attribute values are allowed to be of 
   type CDATA to allow more flexibility for 
   additional values; however, attributes
   typically may only assume one of a well-
   defined set of values
 - Cross-referencing among elements such as 
   AU, AF, EM, and FN is accomplished through 
   the use of attributes of type IDREFS (on AU) 
   and ID (on AF, EM, and FN)

<!-- BIBRECORD is the root element of the XML 
     document.  Attributes are:

   origin  mnemonic indicating individual(s)
           or institution(s) who submitted 
           the record to ADS
   lang    language in which the contents of 
           this record are expressed; the 
           possible values are language tags 
           as defined in RFC 1766.  
           Examples: lang="fr",  lang="en"
-->

<!-- Content model of the root element.  Children 
     must appear in the order listed.  BIBCODE, 
     MSTRING, PUBDATE and ORIGINS are mandatory; 
     every other child is optional.  SERIES is 
     repeatable because a publication may belong 
     to a subseries within a series, in which case 
     each level is described by its own SERIES 
     element. -->
<!ELEMENT BIBRECORD ( METADATA?, 
                      TITLE?, 
                      AUTHORS?, 
                      AFFILIATIONS?, 
                      EMAILS?, 
                      FOOTNOTES?, 
                      BIBCODE, 
                      MSTRING, 
                      MONOGRAPH?, 
                      SERIES*, 
                      PAGE?, 
                      LPAGE?, 
                      COPYRIGHT?, 
                      PUBDATE, 
                      CATEGORIES*, 
                      COMMENTS*, 
                      ANOTE?, 
                      BIBTYPE?, 
                      IDENTIFIERS?, 
                      ORIGINS, 
                      OBJECTS*, 
                      KEYWORDS*, 
                      ABSTRACT* ) >

<!-- origin is mandatory, lang is optional -->
<!ATTLIST BIBRECORD  origin CDATA   #REQUIRED
                     lang   CDATA   #IMPLIED  >

<!-- Generic metadata about the ADS record 
     (rather than the publication).  All four 
     subelements are mandatory and must appear 
     in the order listed. -->
<!ELEMENT METADATA ( VERSION, 
                     CREATOR, 
                     CDATE, 
                     EDATE ) >

<!-- Versioning is introduced to allow parsers 
     to detect and reject any documents not 
     complying with the supported DTD  -->
<!ELEMENT VERSION ( #PCDATA ) >
<!-- CREATOR is purely informative -->
<!ELEMENT CREATOR ( #PCDATA ) >
<!-- Creation date for the record, written as 
     text in the form YYYY-MM-DD.  A DTD cannot 
     constrain the format of character data, so 
     the pattern has to be checked by the 
     application. -->
<!ELEMENT CDATE ( #PCDATA ) >
<!-- Last modified date, written as text in the 
     form YYYY-MM-DD -->
<!ELEMENT EDATE ( #PCDATA ) >

<!-- TITLE: the title of the publication, as plain 
     text.  The lang attribute is optional 
     (presumably a language tag, as for the lang 
     attribute of BIBRECORD). -->
<!ELEMENT TITLE (#PCDATA)>
<!ATTLIST TITLE
    lang CDATA #IMPLIED>

<!-- AUTHORS is a pure container: it holds one or 
     more AU elements, one per author name. -->
<!ELEMENT AUTHORS (AU+)>

<!-- AU describes a single author.  Only the last 
     name (LNAME) is mandatory.  FNAME stores the 
     first and middle name(s), or just the 
     initials.  SUFF is the name suffix, typically 
     one of: Jr., Sr., II, III, IV.  PREF is the 
     salutation, e.g. "Rev." (for "Reverend") or 
     "Prof." (for "Professor"); it is rarely used 
     but is here for completeness.  Children must 
     appear in the order PREF, FNAME, LNAME, SUFF. -->
<!ELEMENT AU (PREF?, FNAME?, LNAME, SUFF?)>

<!-- The optional attributes AF, EM and FN 
     cross-reference author affiliations, email 
     addresses and footnotes with the individual 
     author records.  Each one is a list of ID 
     references, intended to match the ident 
     attribute of AF, EM and FN elements 
     respectively.  These attribute names are in 
     upper case, contrary to the general 
     lower-case policy.  The typical use is:
     <AU AF="AF_1 AF_2" EM="EM_3">...</AU> -->
<!ATTLIST AU
    AF IDREFS #IMPLIED
    EM IDREFS #IMPLIED
    FN IDREFS #IMPLIED>

<!-- Name components shared by AU and ED; all are 
     plain text -->
<!ELEMENT PREF  (#PCDATA)>
<!ELEMENT FNAME (#PCDATA)>
<!ELEMENT LNAME (#PCDATA)>
<!ELEMENT SUFF  (#PCDATA)>

<!-- AFFILIATIONS wraps the individual affiliation 
     records; each record is one AF element and 
     at least one must be present. -->
<!ELEMENT AFFILIATIONS (AF+)>

<!-- AF holds one affiliation as plain text.  Its 
     mandatory ident attribute is the ID that the 
     AF attribute of an AU element refers to, so 
     its value should match one of the values 
     used there. -->
<!ELEMENT AF (#PCDATA)>
<!ATTLIST AF
    ident ID #REQUIRED>

<!-- EMAILS wraps one or more EM elements -->
<!ELEMENT EMAILS (EM+)>

<!-- EM holds plain text.  Its mandatory ident 
     attribute is the ID that the EM attribute of 
     an AU element refers to, so its value should 
     match one of the values used there. -->
<!ELEMENT EM (#PCDATA)>
<!ATTLIST EM
    ident ID #REQUIRED>

<!-- FOOTNOTES and its FN subelements are reserved 
     for future use.  FOOTNOTES wraps one or more 
     FN elements; each FN holds plain text and 
     carries a mandatory ident attribute of type 
     ID. -->
<!ELEMENT FOOTNOTES (FN+)>
<!ELEMENT FN (#PCDATA)>
<!ATTLIST FN
    ident ID #REQUIRED>

<!-- BIBCODE: the bibliographic code of the record.
     For a definition, see:
http://adsdoc.harvard.edu/abs_doc/bib_help.html
http://adsabs.harvard.edu/cgi-bin/nph-bib_query?1995ioda.book..259S
http://adsabs.harvard.edu/cgi-bin/nph-bib_query?1995VA.....39R.272S
     Logically this identifier belongs to the 
     IDENTIFIERS element, but since it is the 
     identifier used internally in the system, 
     it is kept in a prominent and easy to reach 
     place.
-->
<!ELEMENT BIBCODE (#PCDATA)>

<!-- MSTRING: the monograph (article, book, 
     whatever) as one unformatted string.  Example:
     <MSTRING>The Astrophysical Journal, Vol. 526, 
      n. 2, pp. L89-L92</MSTRING>
-->
<!ELEMENT MSTRING (#PCDATA)>
<!-- MONOGRAPH: the fielded (structured) form of the 
     information about the monograph in which the 
     bibliographic entry appeared.  It is typically 
     produced by parsing the text of the MSTRING 
     element.  Only MTITLE is mandatory; MID may 
     repeat; children must appear in the order 
     listed.  Example contents:
      <MTITLE>The Astrophysical Journal</MTITLE>
      <VOLUME>526</VOLUME>
      <ISSUE>2</ISSUE>
      <PUBLISHER>University of Chicago Press
         </PUBLISHER>
-->
<!ELEMENT MONOGRAPH (MTITLE, VOLUME?, ISSUE?, MNOTE?,
                     EDITORS?, EDITION?, PUBLISHER?,
                     LOCATION?, MID*)>

<!-- MTITLE: monograph title 
     (e.g. "Astrophysical Journal") -->
<!ELEMENT MTITLE (#PCDATA)>

<!-- VOLUME and ISSUE are plain text; VOLUME may 
     carry an optional type token -->
<!ELEMENT VOLUME (#PCDATA)>
<!ATTLIST VOLUME
    type NMTOKEN #IMPLIED>
<!ELEMENT ISSUE (#PCDATA)>

<!-- MNOTE: a note about the monograph as supplied 
     by the publisher or editor -->
<!ELEMENT MNOTE (#PCDATA)>

<!-- EDITORS: list of editor names as extracted 
     from MSTRING.  ED is structured exactly like 
     AU (see AUTHORS and AU), but without the 
     cross-reference attributes. -->
<!ELEMENT EDITORS (ED+)>
<!ELEMENT ED (PREF?, FNAME?, LNAME, SUFF?)>

<!-- EDITION: edition of the publication -->
<!ELEMENT EDITION (#PCDATA)>
<!-- PUBLISHER: name of the publisher -->
<!ELEMENT PUBLISHER (#PCDATA)>
<!-- LOCATION: place of publication -->
<!ELEMENT LOCATION (#PCDATA)>
<!-- MID: the monograph identification as supplied 
     by the publisher.  This may be useful in 
     correlating our record with the publisher's 
     online offerings.  The optional "type" 
     attribute characterizes the system used to 
     express the identifier.
     NOTE(review): earlier wording of this comment 
     called the attribute "system"; the declared 
     name is "type". Confirm which one producers 
     actually emit. -->
<!ELEMENT MID (#PCDATA)>
<!ATTLIST MID
    type NMTOKEN #IMPLIED>

<!-- SERIES: present when the bibliographic entry 
     appeared in a series; it describes the series 
     itself.  Typically this is data about a 
     conference series (e.g. ASP Conference 
     Series).  Some publications belong to 
     "subseries" within a series; how many SERIES 
     elements a record may carry is determined by 
     the BIBRECORD content model.  Only SERTITLE 
     is mandatory. -->
<!ELEMENT SERIES (SERTITLE, SERVOL?, SEREDITORS?, SERBIBCODE?)>

<!-- Title, volume, and editors of the conference 
     series.  SEREDITORS reuses the ED element 
     declared for monograph editors. -->
<!ELEMENT SERTITLE   (#PCDATA)>
<!ELEMENT SERVOL     (#PCDATA)>
<!ELEMENT SEREDITORS (ED+)>

<!-- SERBIBCODE: serial bibcode for the publication 
     (may coincide with the main bibcode) -->
<!ELEMENT SERBIBCODE (#PCDATA)>

<!-- PAGE: page designation, as plain text.  The 
     optional type attribute may be set to "s" 
     (sequential) when the value does not 
     represent a number taken from the printed 
     volume.
     NOTE(review): the earlier wording said 
     "printed volume number"; presumably a printed 
     page number is meant. Confirm. -->
<!ELEMENT PAGE (#PCDATA)>
<!ATTLIST PAGE
    type NMTOKEN #IMPLIED>

<!-- LPAGE: the last page number (if known).  It 
     does not make sense when PAGE has type="s". -->
<!ELEMENT LPAGE (#PCDATA)>

<!-- COPYRIGHT: copyright information from the 
     publisher, kept as an unformatted string -->
<!ELEMENT COPYRIGHT (#PCDATA)>

<!-- PUBDATE: a mandatory YEAR followed by an 
     optional MONTH, both plain text -->
<!ELEMENT PUBDATE (YEAR, MONTH?)>
<!ELEMENT MONTH (#PCDATA)>
<!ELEMENT YEAR  (#PCDATA)>

<!-- CATEGORIES: one or more CA subelements naming 
     the subject categories to which the 
     publication was assigned.  STI/RECON has 
     always assigned a category for each entry in 
     their system, but otherwise there is little 
     else in our database.  The optional 
     attributes origin and system keep track of 
     the different classifications used. -->
<!ELEMENT CATEGORIES (CA+)>
<!ATTLIST CATEGORIES
    origin NMTOKEN #IMPLIED
    system NMTOKEN #IMPLIED>
<!ELEMENT CA (#PCDATA)>

<!-- COMMENTS: one or more CO subelements, 
     typically private fields supplied by the data 
     source.  For instance, SIMBAD and LOC provide 
     comments about bibliographic entries.  Both 
     attributes are optional. -->
<!ELEMENT COMMENTS (CO+)>
<!ATTLIST COMMENTS
    lang   CDATA   #IMPLIED
    origin NMTOKEN #IMPLIED>
<!ELEMENT CO (#PCDATA)>

<!-- ANOTE: author note, as plain text -->
<!ELEMENT ANOTE (#PCDATA)>

<!-- BIBTYPE: the type of publication this entry 
     corresponds to.  The DTD itself accepts any 
     text; by convention the value is currently 
     limited to the following tokens (taken 
     straight from the BibTeX classification):
          article
          book
          booklet
          inbook
          incollection
          inproceedings
          manual
          masterthesis
          misc
          phdthesis
          proceedings
          techreport
          unpublished
     NOTE(review): BibTeX itself spells the 
     master's thesis type "mastersthesis"; confirm 
     which spelling ADS records actually use.
-->
<!ELEMENT BIBTYPE (#PCDATA)>

<!-- IDENTIFIERS: list of all known identifiers 
     for this publication, one ID element each -->
<!ELEMENT IDENTIFIERS (ID+)>

<!-- The content of an ID element is the identifier 
     used by a particular publisher or institution.
     The type attribute is mandatory; origin is 
     optional.  Examples:
       <ID origin="UCP" type="PUBID">38426</ID>
       <ID origin="STI" type="ACCNO">A90-12345</ID>
     NOTE(review): earlier versions of these 
     examples used an attribute named "system", 
     which this DTD does not declare for ID. 
     Confirm which name producers actually emit. -->
<!ELEMENT ID (#PCDATA)>
<!ATTLIST ID
    origin NMTOKEN #IMPLIED
    type   NMTOKEN #REQUIRED>

<!-- ORIGINS: the collective list of institutions 
     that have given us a record about this entry, 
     one OR element each -->
<!ELEMENT ORIGINS (OR+)>
<!ELEMENT OR (#PCDATA)>

<!-- OBJECTS: the list of objects associated with 
     the publication, one OB element each -->
<!ELEMENT OBJECTS (OB+)>
<!ELEMENT OB (#PCDATA)>

<!-- Keywords assigned to the publication, one KW 
     element each.  Attributes:

   lang    optional; spelled in lower case like 
           the lang attribute of BIBRECORD, TITLE, 
           COMMENTS and ABSTRACT (presumably the 
           language of the keywords)
   Lang    legacy capitalized spelling of lang.  
           XML names are case-sensitive, so it is 
           still declared to keep existing records 
           valid; new records should use lang
   origin  optional
   system  mandatory (presumably the keyword 
           system in use, by analogy with 
           CATEGORIES)
-->
<!ELEMENT KEYWORDS ( KW+ ) >
<!ATTLIST KEYWORDS   lang   CDATA   #IMPLIED
                     Lang   CDATA   #IMPLIED
                     origin NMTOKEN #IMPLIED
                     system NMTOKEN #REQUIRED >
<!ELEMENT KW ( #PCDATA ) >

<!-- An abstract of the publication.  This is 
     typically provided to us by the publisher, 
     but may in some cases come from other 
     sources (e.g. STI, which keyed abstracts 
     in most cases).  Therefore we allow several 
     ABSTRACT elements within each record, each 
     with a separate origin or language.  
     The attribute type is used to keep track 
     of how the abstract data was generated.  
     For instance, abstract text generated by 
     our OCR software will have: 
         origin="ADS" type="OCR" lang="en"
     All three attributes are optional and are 
     declared in a single attribute list, which 
     is closed only once, after the last 
     attribute.
-->
<!ELEMENT ABSTRACT ( P+ ) >
<!ATTLIST ABSTRACT   origin NMTOKEN #IMPLIED
                     type   NMTOKEN #IMPLIED
                     lang   CDATA   #IMPLIED  >

<!-- An abstract is made of separate paragraphs 
     (P) with mixed content: text interleaved 
     with the inline elements listed below.  
     These elements have the familiar HTML 
     meaning and are used to render the abstract 
     text in a decent way. -->
<!ELEMENT P   (#PCDATA | A | BR | PRE | SUP | SUB)*>

<!-- Line breaks (BR, always empty) and 
     preformatted text (PRE) make it possible to 
     display tables and other preformatted text. -->
<!ELEMENT BR  EMPTY>
<!ELEMENT PRE (#PCDATA | A | BR | SUP | SUB)*>

<!-- A is the familiar anchor element; its HREF 
     attribute is mandatory.  It may not contain 
     another A or a PRE. -->
<!ELEMENT A   (#PCDATA | BR | SUP | SUB)*>
<!ATTLIST A
    HREF CDATA #REQUIRED>

<!-- SUP and SUB are superscripts and subscripts.  
     In this content model they may contain 
     further SUP and SUB elements, although we 
     may decide to restrict them to PCDATA at 
     some point. -->
<!ELEMENT SUP (#PCDATA | A | BR | SUP | SUB)*>
<!ELEMENT SUB (#PCDATA | A | BR | SUP | SUB)*>
Up: The NASA Astrophysics Data