1 package org.robsite.extension.rss.model;
2
3
4 import java.io.IOException;
5
6 import java.net.HttpURLConnection;
7 import java.net.MalformedURLException;
8 import java.net.URL;
9
10 import java.net.URLConnection;
11 import java.text.DateFormat;
12 import java.text.SimpleDateFormat;
13 import java.util.ArrayList;
14 import java.util.Date;
15 import java.util.HashMap;
16 import java.util.Iterator;
17 import java.util.Map;
18
19 import java.util.Stack;
20 import java.util.TimeZone;
21 import javax.mail.internet.MailDateFormat;
22 import javax.xml.parsers.ParserConfigurationException;
23 import javax.xml.parsers.SAXParser;
24 import javax.xml.parsers.SAXParserFactory;
25
26 import org.xml.sax.Attributes;
27 import org.xml.sax.InputSource;
28 import org.xml.sax.SAXException;
29 import org.xml.sax.XMLReader;
30 import org.xml.sax.helpers.DefaultHandler;
31
32 /***
33 * This is a simple RSS parser.
34 *
35 * @author rcleveng@users.sourceforge.net
36 * @author brian_duff@users.sourceforge.net
37 */
38 public class SimpleRSSParser extends DefaultHandler
39 {
40 private static final String CONTEXT_ITEM = "item";
41 private static final String CONTEXT_TEXT = "text";
42
43 private static final String DUBLIN_CORE_URI =
44 "http://purl.org/dc/elements/1.1/";
45
46 private Channel _channel;
47 private final RSSTagContext _context = new RSSTagContext();
48 private final Map _handlers = new HashMap();
49
50 /***
51 * Defines the interface for a tag handler.
52 */
53 private abstract class RSSTagHandler
54 {
55 public void handleStartElement( RSSTagContext context ) {}
56 public void handleEndElement( RSSTagContext context ) {}
57 }
58
59 /***
60 * The context passed to tag handlers.
61 */
62 private class RSSTagContext
63 {
64 private Map _contextValues = new HashMap();
65 private Attributes _attributes;
66 private String _uri;
67 private String _name;
68 private String _qname;
69
70 private StringBuffer _text;
71
72 /***
73 * Gives the path to the current element.
74 */
75 private Stack _elementPath = new Stack();
76
77
78 /***
79 * Get the XPath to the current node.
80 *
81 * @return an xpath location identifying the current node's position in
82 * the XML document.
83 */
84 public String getPath()
85 {
86 StringBuffer pathBuffer = new StringBuffer();
87 ArrayList path = new ArrayList( _elementPath );
88 pathBuffer.append( "/" );
89 for ( Iterator i = path.iterator(); i.hasNext(); )
90 {
91 pathBuffer.append( String.valueOf( i.next() ) );
92 pathBuffer.append( "/" );
93 }
94
95 return pathBuffer.toString();
96 }
97
98 /***
99 * Get the local name of the parent element, if any.
100 *
101 * @return the local name of the parent element, or the empty string if this
102 * element is the root.
103 */
104 public String getParentElement()
105 {
106 if ( !_elementPath.isEmpty() )
107 {
108 return (String)_elementPath.peek();
109 }
110 return "";
111 }
112
113 public void pushPathElement( String name )
114 {
115 _elementPath.push( name );
116 }
117
118 public void popPathElement()
119 {
120 _elementPath.pop();
121 }
122
123
124 public String getText()
125 {
126 if ( _text == null ) return null;
127
128 return _text.toString();
129 }
130
131 public void setAttributes( Attributes attributes )
132 {
133 _attributes = attributes;
134 }
135
136 public Attributes getAttributes()
137 {
138 return _attributes;
139 }
140
141 public void setURI( String uri )
142 {
143 _uri = uri;
144 }
145
146 public String getURI()
147 {
148 return _uri;
149 }
150
151 public void setName( String name )
152 {
153 _name = name;
154 }
155
156 public String getName()
157 {
158 return _name;
159 }
160
161 public void setQName( String qname )
162 {
163 _qname = qname;
164 }
165
166 public String getQName()
167 {
168 return _qname;
169 }
170
171 public Object getValue( Object key )
172 {
173 return _contextValues.get( key );
174 }
175
176 public void putValue( Object key, Object value )
177 {
178 _contextValues.put( key, value );
179 }
180 }
181
/**
 * Creates a parser that fills in the given channel and registers the
 * built-in tag handlers.
 *
 * @param channel the channel to populate; when {@code null} a new
 *    {@code Channel} is created instead
 */
private SimpleRSSParser( Channel channel )
{
  _channel = ( channel == null ) ? new Channel() : channel;

  registerDefaultHandlers();
}
200
201
202 public static Channel parse( String urlString ) throws MalformedURLException,
203 SAXException, ParserConfigurationException, IOException
204 {
205 return parse( urlString, null );
206 }
207
208
209 public static void parse( Channel channel ) throws MalformedURLException,
210 SAXException, ParserConfigurationException, IOException
211 {
212 parse( channel.getURL(), channel );
213 }
214
215
216 public static Channel parse( String urlString, Channel channel )
217 throws MalformedURLException, SAXException, IOException,
218 ParserConfigurationException
219 {
220 if ( channel == null )
221 {
222 channel = new Channel();
223 }
224
225
226 URL url = new URL( urlString );
227
228 URLConnection conn = url.openConnection();
229
230 if ( conn instanceof HttpURLConnection )
231 {
232 HttpURLConnection connection = (HttpURLConnection) conn;
233
234 if ( channel.getHTTPLastModified() != null )
235 {
236 connection.setRequestProperty(
237 "If-Modified-Since",
238 channel.getHTTPLastModified()
239 );
240 }
241 if ( channel.getHTTPEtag() != null )
242 {
243 connection.setRequestProperty(
244 "If-None-Match",
245 channel.getHTTPEtag()
246 );
247 }
248
249 connection.connect();
250
251 int responseCode = connection.getResponseCode();
252
253
254 if ( responseCode == 304 )
255 {
256 connection.disconnect();
257 return channel;
258 }
259 else if ( responseCode == 200 )
260 {
261 String lastModified = connection.getHeaderField( "Last-Modified" );
262 String etag = connection.getHeaderField( "ETag" );
263 channel.setHTTPLastModified( lastModified );
264 channel.setHTTPEtag( etag );
265 }
266 else
267 {
268 throw new IOException(
269 "Could not connect to "+urlString+": "+
270 responseCode + " " + connection.getResponseMessage()
271 );
272 }
273 }
274
275
276
277 Channel result = null;
278 result = parse( new InputSource( conn.getInputStream() ), channel );
279 result.setURL( urlString );
280 result.setOpen( true );
281 return result;
282 }
283
284
285 private static Channel parse( InputSource inputSource, Channel channel )
286 throws SAXException, IOException, ParserConfigurationException
287 {
288 SAXParserFactory factory = SAXParserFactory.newInstance();
289 SAXParser parser = factory.newSAXParser();
290 XMLReader reader = parser.getXMLReader();
291 SimpleRSSParser handler = new SimpleRSSParser( channel );
292 reader.setContentHandler( handler );
293 reader.setErrorHandler( handler );
294
295 reader.parse( inputSource );
296
297 return handler.getChannel();
298 }
299
300 public Channel getChannel()
301 {
302 return _channel;
303 }
304
305 private void registerHandler( String elementName, RSSTagHandler handler )
306 {
307 _handlers.put( elementName, handler );
308 }
309
310 private RSSTagHandler lookupHandler( String elementName )
311 {
312 return (RSSTagHandler) _handlers.get( elementName );
313 }
314
315 private void registerDefaultHandlers()
316 {
317 registerHandler( "item", new RSSTagHandler()
318 {
319 public void handleStartElement( RSSTagContext context )
320 {
321 context.putValue( CONTEXT_ITEM, new Item() );
322 }
323
324 public void handleEndElement( RSSTagContext context )
325 {
326 _channel.addItem(
327 (Item) context.getValue( CONTEXT_ITEM ),
328 false
329 );
330 context.putValue( CONTEXT_ITEM, null );
331 }
332 } );
333
334 registerHandler( "description", new RSSTagHandler()
335 {
336 public void handleEndElement( RSSTagContext context )
337 {
338 Item item = (Item) context.getValue( CONTEXT_ITEM );
339 if ( item == null )
340 {
341 _channel.setDescription( context.getText() );
342 }
343 else
344 {
345 item.setDescription( context.getText() );
346 }
347 }
348 });
349
350 registerHandler( "link", new RSSTagHandler()
351 {
352 public void handleEndElement( RSSTagContext context )
353 {
354 Item item = (Item) context.getValue( CONTEXT_ITEM );
355 if ( item == null )
356 {
357 _channel.setLink( context.getText() );
358 }
359 else
360 {
361 item.setLink( context.getText() );
362 }
363 }
364 } );
365
366 registerHandler( "title", new RSSTagHandler()
367 {
368 public void handleEndElement( RSSTagContext context )
369 {
370 Item item = (Item) context.getValue( CONTEXT_ITEM );
371
372
373 if ( item == null && "channel".equals( context.getParentElement() ))
374 {
375 _channel.setTitle( context.getText() );
376 }
377 else if ( item != null )
378 {
379 item.setTitle( context.getText() );
380 }
381 }
382 } );
383
384 registerHandler( "pubDate", new RSSTagHandler()
385 {
386 public void handleEndElement( RSSTagContext context )
387 {
388 Item item = (Item) context.getValue( CONTEXT_ITEM );
389 if ( item != null )
390 {
391 Date d = parseDate( context.getText() );
392 if ( d != null )
393 {
394 item.setPublishDate( d );
395 }
396 }
397 }
398 } );
399
400 registerHandler( "guid", new RSSTagHandler()
401 {
402 public void handleEndElement( RSSTagContext context )
403 {
404 Item item = (Item) context.getValue( CONTEXT_ITEM );
405 if ( item != null )
406 {
407 item.setGuid( context.getText() );
408 }
409 }
410 } );
411
412
413 registerHandler( "date", new RSSTagHandler()
414 {
415 public void handleEndElement( RSSTagContext context )
416 {
417 Item item = (Item) context.getValue( CONTEXT_ITEM );
418 if ( item != null )
419 {
420 if ( DUBLIN_CORE_URI.equals( context.getURI() ) )
421 {
422 Date date = parseDate( _context.getText() );
423 if ( date != null )
424 {
425 item.setPublishDate( date );
426 }
427 }
428 }
429 }
430 } );
431 }
432
433
434 /***
435 * Attempt to parse a date. Dates in RSS are pretty non-standard,
436 * unfortunately. The RSS spec calls for RFC822 dates, but in practice,
437 * this isn't followed by all feed generators. Nothing like standards ;)
438 *
439 * @param date
440 * @return
441 */
442 private Date parseDate( String date )
443 {
444 Date d = parseRFC822Date( date );
445 if ( d == null )
446 {
447 d = parseISODate( date );
448 }
449
450 return d;
451 }
452
453 private Date parseISODate( String date )
454 {
455 try
456 {
457
458
459 String formatString = null;
460 TimeZone timeZone = null;
461
462
463 if ( date.indexOf( 'T' ) == -1 )
464 {
465 formatString = "yyyy-MM-dd";
466 }
467 else
468 {
469 if ( date.length() >= 7 )
470 {
471 char tzChar = date.charAt( date.length() - 6 );
472 if ( tzChar == 'Z' )
473 {
474 date = date.substring( 0, date.length() - 6 );
475 }
476 else if ( tzChar == '+' || tzChar == '-' )
477 {
478 String tzId = "GMT" + date.substring( date.length() - 6 );
479 timeZone = TimeZone.getTimeZone( tzId );
480 date = date.substring( 0, date.length() - 6 );
481 }
482
483
484 if ( date.indexOf( "," ) >= 0 )
485 {
486 formatString = "yyyy-MM-dd'T'HH:mm:ss,SSS";
487 }
488 else
489 {
490 formatString = "yyyy-MM-dd'T'HH:mm:ss";
491 }
492 }
493 }
494
495 if ( formatString == null )
496 {
497 throw new Exception( "Date does not parse" );
498 }
499
500
501 DateFormat f = new SimpleDateFormat( formatString );
502 if ( timeZone == null )
503 {
504 timeZone = TimeZone.getTimeZone( "UTC" );
505 }
506 f.setTimeZone( timeZone );
507
508 return f.parse( date );
509 }
510 catch ( Exception e )
511 {
512 return null;
513 }
514 }
515
516 private Date parseRFC822Date( String date )
517 {
518 try
519 {
520 MailDateFormat mdf = new MailDateFormat();
521 return mdf.parse( date );
522 }
523 catch ( Exception e )
524 {
525 return null;
526 }
527 }
528
529 public void startElement (String uri, String name, String qName, Attributes atts)
530 {
531
532 _context._text = new StringBuffer();
533 _context.setURI( uri );
534 _context.setName( name );
535 _context.setQName( qName );
536 _context.setAttributes( atts );
537
538 RSSTagHandler handler = lookupHandler( name );
539 if ( handler != null )
540 {
541 handler.handleStartElement( _context );
542 }
543
544 _context.pushPathElement( name );
545 }
546
547 public void endElement (String uri, String name, String qName)
548 {
549 _context.setURI( uri );
550 _context.setName( name );
551 _context.setQName( qName );
552
553 _context.setAttributes( null );
554
555 RSSTagHandler handler = lookupHandler( name );
556 _context.popPathElement();
557 if ( handler != null )
558 {
559 handler.handleEndElement( _context );
560 }
561
562 _context._text = null;
563 }
564
565
566 public void characters (char ch[], int start, int length)
567 {
568 if ( _context._text != null )
569 {
570 _context._text.append( new String( ch, start, length ) );
571 }
572 }
573
574
575
576 public static void main(String[] args)
577 {
578 try
579 {
580 Channel channel = null;
581 System.out.println( "*** BEGIN OTN NEWS ***" );
582 channel = SimpleRSSParser.parse( "http://www.orablogs.com/duffblog/index.xml" );
583 System.out.println( channel.dumpString() );
584 System.out.println( "*** END OTN NEWS ***" );
585 System.out.println( "" );
586 System.out.println( "" );
587
588
589
590
591
592
593
594
595 System.out.println( "*** BEGIN ***" );
596 channel = SimpleRSSParser.parse( "http://radio.weblogs.com/0129487/rss.xml" );
597
598 System.out.println( channel.dumpString() );
599 System.out.println( "*** END ROBSITE.ORG NEWS ***" );
600 System.out.println( "" );
601 System.out.println( "" );
602 }
603 catch ( Exception ex)
604 {
605 ex.printStackTrace();
606 }
607 }
608
609 }