View Javadoc

1   package org.robsite.extension.rss.model;
2   
3   
4   import java.io.IOException;
5   
6   import java.net.HttpURLConnection;
7   import java.net.MalformedURLException;
8   import java.net.URL;
9   
10  import java.net.URLConnection;
11  import java.text.DateFormat;
12  import java.text.SimpleDateFormat;
13  import java.util.ArrayList;
14  import java.util.Date;
15  import java.util.HashMap;
16  import java.util.Iterator;
17  import java.util.Map;
18  
19  import java.util.Stack;
20  import java.util.TimeZone;
21  import javax.mail.internet.MailDateFormat;
22  import javax.xml.parsers.ParserConfigurationException;
23  import javax.xml.parsers.SAXParser;
24  import javax.xml.parsers.SAXParserFactory;
25  
26  import org.xml.sax.Attributes;
27  import org.xml.sax.InputSource;
28  import org.xml.sax.SAXException;
29  import org.xml.sax.XMLReader;
30  import org.xml.sax.helpers.DefaultHandler;
31  
32  /***
33   * This is a simple RSS parser. 
34   * 
35   * @author rcleveng@users.sourceforge.net
36   * @author brian_duff@users.sourceforge.net
37   */
38  public class SimpleRSSParser extends DefaultHandler
39  {
40    private static final String CONTEXT_ITEM = "item";
41    private static final String CONTEXT_TEXT = "text";
42    
43    private static final String DUBLIN_CORE_URI = 
44      "http://purl.org/dc/elements/1.1/";
45    
46    private Channel _channel;
47    private final RSSTagContext _context = new RSSTagContext();
48    private final Map _handlers = new HashMap();
49    
50    /***
51     * Defines the interface for a tag handler.
52     */
53    private abstract class RSSTagHandler
54    {
55      public void handleStartElement( RSSTagContext context ) {}
56      public void handleEndElement( RSSTagContext context  ) {}
57    }
58    
59    /***
60     * The context passed to tag handlers.
61     */
62    private class RSSTagContext
63    {
64      private Map _contextValues = new HashMap();
65      private Attributes _attributes;
66      private String _uri;
67      private String _name;
68      private String _qname;
69      
70      private StringBuffer _text;
71      
72      /***
73       * Gives the path to the current element.
74       */
75      private Stack _elementPath = new Stack();
76      
77      
78      /***
79       * Get the XPath to the current node. 
80       * 
81       * @return an xpath location identifying the current node's position in
82       * the XML document.
83       */
84      public String getPath()
85      {
86        StringBuffer pathBuffer = new StringBuffer();
87        ArrayList path = new ArrayList( _elementPath );
88        pathBuffer.append( "/" );
89        for ( Iterator i = path.iterator(); i.hasNext(); )
90        {
91          pathBuffer.append( String.valueOf( i.next() ) );
92          pathBuffer.append( "/" );
93        }
94        
95        return pathBuffer.toString();
96      }
97      
98      /***
99       * Get the local name of the parent element, if any. 
100      * 
101      * @return the local name of the parent element, or the empty string if this
102      *    element is the root.
103      */
104     public String getParentElement()
105     {
106       if ( !_elementPath.isEmpty() )
107       {
108         return (String)_elementPath.peek();
109       }
110       return "";
111     }
112     
113     public void pushPathElement( String name )
114     {
115       _elementPath.push( name );
116     }
117     
118     public void popPathElement()
119     {
120       _elementPath.pop();
121     }
122     
123     
124     public String getText()
125     {
126       if ( _text == null ) return null;
127       
128       return _text.toString();
129     }
130     
131     public void setAttributes( Attributes attributes )
132     {
133       _attributes = attributes;
134     }
135     
136     public Attributes getAttributes()
137     {
138       return _attributes;
139     }
140     
141     public void setURI( String uri )
142     {
143       _uri = uri;
144     }
145     
146     public String getURI()
147     {
148       return _uri;
149     }
150     
151     public void setName( String name )
152     {
153       _name = name;
154     }
155     
156     public String getName()
157     {
158       return _name;
159     }
160     
161     public void setQName( String qname )
162     {
163       _qname = qname;
164     }
165     
166     public String getQName()
167     {
168       return _qname;
169     }
170     
171     public Object getValue( Object key )
172     {
173       return _contextValues.get( key );
174     }
175     
176     public void putValue( Object key, Object value )
177     {
178       _contextValues.put( key, value );
179     }
180   }
181   
182   /***
183    * Construct the rss parser for the specified channel.
184    * 
185    * @param channel
186    */
187   private SimpleRSSParser( Channel channel )
188   {
189     if ( channel != null )
190     {
191       _channel = channel;
192     }
193     else
194     {
195       _channel  = new Channel();
196     }
197     
198     registerDefaultHandlers();
199   }
200 
201 
202   public static Channel parse( String urlString ) throws MalformedURLException, 
203     SAXException, ParserConfigurationException, IOException
204   {
205     return parse( urlString, null );
206   }
207 
208 
209   public static void parse( Channel channel ) throws MalformedURLException, 
210     SAXException, ParserConfigurationException, IOException
211   {
212     parse( channel.getURL(), channel );
213   }
214 
215 
216   public static Channel parse( String urlString, Channel channel ) 
217     throws MalformedURLException, SAXException, IOException, 
218           ParserConfigurationException
219   {
220     if ( channel == null )
221     {
222       channel = new Channel();
223     }
224   
225     // If this fails, it'll throw a MalformedURLException
226     URL url = new URL( urlString );
227     
228     URLConnection conn = url.openConnection();
229     // Implement conditional HTTP get...
230     if ( conn instanceof HttpURLConnection )
231     {
232       HttpURLConnection connection = (HttpURLConnection) conn;
233 
234       if ( channel.getHTTPLastModified() != null )
235       {
236         connection.setRequestProperty(
237           "If-Modified-Since",
238           channel.getHTTPLastModified()
239         );
240       } 
241       if ( channel.getHTTPEtag() != null )
242       {
243         connection.setRequestProperty(
244           "If-None-Match",
245           channel.getHTTPEtag()
246         );
247       }
248       
249       connection.connect();
250       
251       int responseCode = connection.getResponseCode();
252       
253       // The rss feed has not been modified.
254       if ( responseCode == 304 )
255       {
256         connection.disconnect();
257         return channel;
258       }
259       else if ( responseCode == 200 ) // OK
260       {
261         String lastModified = connection.getHeaderField( "Last-Modified" );
262         String etag = connection.getHeaderField( "ETag" );
263         channel.setHTTPLastModified( lastModified );
264         channel.setHTTPEtag( etag );
265       }
266       else
267       {
268         throw new IOException(
269           "Could not connect to "+urlString+": "+
270           responseCode + " " + connection.getResponseMessage()
271         );
272       }
273     }
274     
275 
276 
277     Channel result = null;
278     result = parse( new InputSource( conn.getInputStream() ), channel );
279     result.setURL( urlString );
280     result.setOpen( true );
281     return result;
282   }
283 
284 
285   private static Channel parse( InputSource inputSource, Channel channel ) 
286     throws SAXException, IOException, ParserConfigurationException
287   {
288     SAXParserFactory factory = SAXParserFactory.newInstance();
289     SAXParser parser = factory.newSAXParser();
290     XMLReader reader = parser.getXMLReader();
291     SimpleRSSParser handler = new SimpleRSSParser( channel );
292     reader.setContentHandler( handler );
293     reader.setErrorHandler( handler );
294 
295     reader.parse( inputSource  );
296     
297     return handler.getChannel();
298   }
299 
300   public Channel getChannel()
301   {
302     return _channel;
303   }
304   
305   private void registerHandler( String elementName, RSSTagHandler handler )
306   {
307     _handlers.put( elementName, handler );
308   }
309   
310   private RSSTagHandler lookupHandler( String elementName )
311   {
312     return (RSSTagHandler) _handlers.get( elementName );
313   }
314 
315   private void registerDefaultHandlers()
316   {
317     registerHandler( "item", new RSSTagHandler() 
318     {
319       public void handleStartElement( RSSTagContext context )
320       {
321         context.putValue( CONTEXT_ITEM, new Item() );
322       }
323       
324       public void handleEndElement( RSSTagContext context )
325       {
326         _channel.addItem( 
327           (Item) context.getValue( CONTEXT_ITEM ),
328           false
329         );
330         context.putValue( CONTEXT_ITEM, null );
331       }
332     } );
333     
334     registerHandler( "description", new RSSTagHandler() 
335     {
336       public void handleEndElement( RSSTagContext context )
337       {
338         Item item = (Item) context.getValue( CONTEXT_ITEM );
339         if ( item == null )
340         {
341           _channel.setDescription( context.getText() );
342         }
343         else
344         {
345           item.setDescription( context.getText() );
346         }
347       }
348     });
349     
350     registerHandler( "link", new RSSTagHandler() 
351     {
352       public void handleEndElement( RSSTagContext context )
353       {
354         Item item = (Item) context.getValue( CONTEXT_ITEM );
355         if ( item == null )
356         {
357           _channel.setLink( context.getText() );
358         }
359         else
360         {
361           item.setLink( context.getText() );
362         }
363       }
364     } );
365     
366     registerHandler( "title", new RSSTagHandler() 
367     {
368       public void handleEndElement( RSSTagContext context )
369       {
370         Item item = (Item) context.getValue( CONTEXT_ITEM );
371         // Check that channel is the parent element. Some feeds (e.g. OTN)
372         // use <channel><title>..</title><image><title>...</title></image></channel>
373         if ( item == null && "channel".equals( context.getParentElement() ))
374         {
375           _channel.setTitle( context.getText() );
376         }
377         else if ( item != null )
378         {
379           item.setTitle( context.getText() );
380         }
381       }
382     } );
383     
384     registerHandler( "pubDate", new RSSTagHandler() 
385     {
386       public void handleEndElement( RSSTagContext context )
387       {
388         Item item = (Item) context.getValue( CONTEXT_ITEM );
389         if ( item != null )
390         {
391           Date d = parseDate( context.getText() );
392           if ( d != null )
393           {
394             item.setPublishDate( d );
395           }
396         }
397       }
398     } );
399     
400     registerHandler( "guid", new RSSTagHandler()
401     {
402       public void handleEndElement( RSSTagContext context )
403       {
404         Item item = (Item) context.getValue( CONTEXT_ITEM );
405         if ( item != null )
406         {
407           item.setGuid( context.getText() );
408         }
409       }
410     } );
411     
412     // Dublin Core dates
413     registerHandler( "date", new RSSTagHandler()
414     {
415       public void handleEndElement( RSSTagContext context )
416       {
417         Item item = (Item) context.getValue( CONTEXT_ITEM );
418         if ( item != null ) 
419         {
420           if ( DUBLIN_CORE_URI.equals( context.getURI() ) )
421           {
422             Date date = parseDate( _context.getText() );
423             if ( date != null )
424             {
425               item.setPublishDate( date );
426             }
427           }
428         }
429       }
430     } );
431   }
432   
433   
434   /***
435    * Attempt to parse a date. Dates in RSS are pretty non-standard, 
436    * unfortunately. The RSS spec calls for RFC822 dates, but in practice, 
437    * this isn't followed by all feed generators. Nothing like standards ;)
438    * 
439    * @param date
440    * @return 
441    */
442   private Date parseDate( String date )
443   {
444     Date d = parseRFC822Date( date );
445     if ( d == null )
446     {
447       d = parseISODate( date );
448     }
449     
450     return d;
451   }
452   
453   private Date parseISODate( String date )
454   {
455     try
456     {
457       // Check the 6th character from the end. If this is + or -, we have 
458       // a specific timezone. Otherwise, it should be 'Z', representing UTC
459       String formatString = null;
460       TimeZone timeZone = null;
461       
462       // Is it an ISO date without a time?
463       if ( date.indexOf( 'T' ) == -1 )
464       {
465         formatString = "yyyy-MM-dd";
466       }
467       else
468       {
469         if ( date.length() >= 7 )
470         {        
471           char tzChar = date.charAt( date.length() - 6 );
472           if ( tzChar == 'Z' )
473           {
474             date = date.substring( 0, date.length() - 6 );
475           }
476           else if ( tzChar == '+' || tzChar == '-' )
477           {
478             String tzId = "GMT" + date.substring( date.length() - 6 );
479             timeZone = TimeZone.getTimeZone( tzId );
480             date = date.substring( 0, date.length() - 6 );
481           }
482           
483           // If the date contains a comma, it has milliseconds.
484           if ( date.indexOf( "," ) >= 0 )
485           {
486             formatString = "yyyy-MM-dd'T'HH:mm:ss,SSS";
487           }
488           else
489           {
490             formatString = "yyyy-MM-dd'T'HH:mm:ss";
491           }
492         }
493       }      
494       
495       if ( formatString == null )
496       {
497         throw new Exception( "Date does not parse" );
498       }
499         
500       // Parse a date in the ISO8601 format
501       DateFormat f = new SimpleDateFormat( formatString );
502       if ( timeZone == null )
503       {
504         timeZone = TimeZone.getTimeZone( "UTC" );
505       }
506       f.setTimeZone( timeZone );
507       
508       return f.parse( date );
509     }
510     catch ( Exception e )
511     {
512       return null;
513     }
514   }
515   
516   private Date parseRFC822Date( String date )
517   {
518     try
519     {
520       MailDateFormat mdf = new MailDateFormat();
521       return mdf.parse( date );
522     }
523     catch ( Exception e )
524     {
525       return null;
526     }
527   }
528 
529   public void startElement (String uri, String name, String qName, Attributes atts)
530   {
531     
532     _context._text = new StringBuffer();
533     _context.setURI( uri );
534     _context.setName( name );
535     _context.setQName( qName );
536     _context.setAttributes( atts );
537 
538     RSSTagHandler handler = lookupHandler( name );
539     if ( handler != null )
540     {
541       handler.handleStartElement( _context );
542     }
543     
544     _context.pushPathElement( name );
545   }
546   
547   public void endElement (String uri, String name, String qName)
548   {
549     _context.setURI( uri );
550     _context.setName( name );
551     _context.setQName( qName );
552     // This shouldn't be accessible from handleEndElement.
553     _context.setAttributes( null );
554   
555     RSSTagHandler handler = lookupHandler( name );
556     _context.popPathElement();    
557     if ( handler != null )
558     {
559       handler.handleEndElement( _context );
560     }
561     
562     _context._text = null;
563   }
564 
565 
566   public void characters (char ch[], int start, int length)
567   {
568     if ( _context._text != null )
569     {
570       _context._text.append( new String( ch, start, length ) );
571     }
572   }
573 
574 
575 
576   public static void main(String[] args)
577   {
578     try
579     {
580       Channel channel = null;
581       System.out.println( "*** BEGIN OTN NEWS ***" );
582       channel = SimpleRSSParser.parse( "http://www.orablogs.com/duffblog/index.xml" );
583       System.out.println( channel.dumpString() );
584       System.out.println( "*** END OTN NEWS ***" );
585       System.out.println( "" );
586       System.out.println( "" );
587 
588 //      System.out.println( "*** BEGIN DIVE INTO BC4J NEWS ***" );
589 //      channel = SimpleRSSParser.parse( "http://radio.weblogs.com/0118231/rss.xml" );
590 //      System.out.println( channel.dumpString() );
591 //      System.out.println( "*** END DIVE INTO BC4J NEWS ***" );
592 //      System.out.println( "" );
593 //      System.out.println( "" );
594 
595       System.out.println( "*** BEGIN ***" );
596       channel = SimpleRSSParser.parse( "http://radio.weblogs.com/0129487/rss.xml" );
597       //System.out.println( channel.dumpString() );
598       System.out.println( channel.dumpString() );
599       System.out.println( "*** END ROBSITE.ORG NEWS ***" );
600       System.out.println( "" );
601       System.out.println( "" );
602     }
603     catch ( Exception ex)
604     {
605       ex.printStackTrace();
606     }
607   }
608   
609 }