1
2
3
4
5
6 """
7 NetCatch enables the user to scan a list of labelled urls and select
8 a subset to read into a file.
9
10 Functions:
11 get_urls_by_label
12 get_urls_by_index
13 get_urls_by_range
14 select_output_file
15 """
16 import sys
17 import os
18 import urllib
19 from tempfile import mktemp
20 import sgmllib
21 import string
22 from Bio import File
23
24
def is_absolute_url( candidate ):
    """Return 1 if candidate is an absolute URL (it has both a scheme
    and a host), 0 otherwise.

    candidate -- string to test, e.g. 'http://host/page.html'.

    NOTE(review): the original `def` line was lost in this chunk; the
    name is reconstructed from the function's behavior — confirm against
    the complete file.
    """
    # The original used urllib.splittype/splithost, which exist only in
    # Python 2's urllib; urlparse gives the same scheme/host split on
    # both major versions.
    try:
        from urllib.parse import urlparse      # Python 3
    except ImportError:
        from urlparse import urlparse          # Python 2
    parsed = urlparse( candidate )
    if parsed.scheme == '':
        return 0
    if parsed.netloc == '':
        return 0
    return 1
33
34 """
35 ExtractUrls.py
36
37
38 Scans a file in http format and builds a dictionary of urls
39 """
40
42
46
def reset( self ):
    """Reset the parser to a clean state before scanning a document.

    NOTE(review): the `def` line and the enclosing `class` header were
    lost in this chunk; this is a method of an sgmllib.SGMLParser
    subclass (it chains to SGMLParser.reset) — confirm against the
    complete file.
    """
    sgmllib.SGMLParser.reset( self )
    self.urls = {}            # label -> url mapping built during the parse
    self._inlink = 0          # true while inside an <a> ... </a> element
    self._pending_url = ''    # href of the anchor currently open
    self.text = ''            # text accumulated for the current label
53
def __str__( self ):
    """Return the collected urls, one '<label> : <url>' pair per line.

    NOTE(review): the `def` line was lost in this chunk; reconstructed
    as __str__ from the string-building behavior — confirm against the
    complete file.
    """
    # ''.join over a list is linear; the original's repeated string
    # concatenation was quadratic in the number of urls.
    lines = [ '%s : %s\n' % ( key, val ) for key, val in self.urls.items() ]
    return ''.join( lines )
60
64
78
82
def start_a( self, attrs ):
    """Handle an opening <a> tag: note that we are inside a link and
    capture its href attribute, if present.

    attrs -- list of (name, value) attribute pairs from the SGML parser.

    NOTE(review): the `def` line was lost in this chunk; the name
    follows sgmllib's start_<tag> handler convention — confirm against
    the complete file.
    """
    self._inlink = 1
    # scan every attribute; if href appears more than once the last
    # occurrence wins, as in the original
    for name, value in attrs:
        if name.lower() == 'href':
            self._pending_url = value
88
def end_a( self ):
    """Handle a closing </a> tag: record the accumulated link text as
    the label for the pending url, then clear the text buffer.

    Labels have spaces replaced by underscores; anchors with no text are
    dropped (nothing is recorded).

    NOTE(review): the `def` line was lost in this chunk; the name
    follows sgmllib's end_<tag> handler convention — confirm against the
    complete file.
    """
    self._inlink = 0
    label = self.text
    self.text = ''
    if label != '':
        label = label.replace( ' ', '_' )
        self.urls[ label ] = self._pending_url
96
104
106 """
107 Decorator for a dictionary of links. Each link is indexed by its label.
108 Allows the user to select links of interest and read each selection into
109 its own file. The filename is contructed by appending the label with an
110 extension of html.
111
112 Files can be selected by index, range or label. The destination directory
113 defaults to the current directory. The user can specify another
114 dictionary by passing a list of path segments to the constructor.
115
116 net_catch = NetCatch()
117 net_catch = NetCatch( [ 'amylase', 'species' ] )
118 net_catch.get_all_urls()
119 net_catch.get_urls_by_label( [ 'pig', 'dog', 'cow' ] )
120 net_catch.get_urls_by_index( [ 1, 4, 6, 9 ] )
121 net_catch.get_urls_by_range( 2, 5 )
122 """
123
def __init__( self, path_segments = None ):
    """Set up an empty url/label store.

    path_segments -- optional list of directory names that are joined to
    form the destination directory for fetched files; defaults to the
    current directory.

    NOTE(review): the original default was the shared mutable []; None
    is used as the sentinel so every instance gets its own fresh list.
    The behavior seen by callers is unchanged.
    """
    self._urls = {}      # label -> url
    self._labels = []    # insertion-ordered labels, addressed by index
    if path_segments is None:
        path_segments = []
    assert type( path_segments ) == type( [] )
    self.path_segments = path_segments
    self._build_path()
130
def _build_path( self ):
    """Join the constructor's path segments into self.base_path.

    With no segments the base path is '' (the current directory), the
    same result the original accumulation loop produced.

    NOTE(review): the `def` line was lost in this chunk; the name is
    reconstructed from the constructor's call — confirm against the
    complete file.
    """
    self.base_path = os.path.join( '', *self.path_segments )
136
def __str__( self ):
    """Return one line per stored url: '<index> <label>: <url>'.

    NOTE(review): the `def` line was lost in this chunk; reconstructed
    as __str__ from the string-building behavior — confirm against the
    complete file.
    """
    # enumerate replaces the hand-rolled counter; join avoids quadratic
    # string concatenation
    lines = [ '%d %s: %s\n' % ( i, label, self._urls[ label ] )
              for i, label in enumerate( self._labels ) ]
    return ''.join( lines )
144
148
154
def get_all_urls( self ):
    """Fetch every stored url and write each one to its own file.

    Files are named '<label><index>.htm' under self.base_path.

    NOTE(review): the `def` line was lost in this chunk; the name comes
    from the class docstring's example (net_catch.get_all_urls()) —
    confirm against the complete file.
    """
    url_opener = urllib.URLopener()
    for i, label in enumerate( self._labels ):
        name = '%s%d.htm' % ( label, i )
        full_path = os.path.join( self.base_path, name )
        # fetch first so a failed download no longer leaves behind an
        # empty output file
        url_handle = url_opener.open( self._urls[ label ] )
        try:
            contents = url_handle.read()
        finally:
            # bug fix: the original closed the opener on every
            # iteration (while still reusing it) and never closed the
            # response handle
            url_handle.close()
        out_handle = open( full_path, "wb" )
        try:
            out_handle.write( contents )
        finally:
            out_handle.close()
    url_opener.close()
170
def get_urls_by_label( self, labels ):
    """Fetch the urls for the given labels and write each one to
    '<label>.htm' under self.base_path.

    labels -- iterable of labels previously stored in self._urls.

    NOTE(review): the `def` line was lost in this chunk; the name comes
    from the class docstring's example — confirm against the complete
    file.
    """
    url_opener = urllib.URLopener()
    for label in labels:
        full_path = os.path.join( self.base_path, '%s.htm' % label )
        url_handle = url_opener.open( self._urls[ label ] )
        try:
            contents = url_handle.read()
        finally:
            # bug fix: the original closed the opener on every
            # iteration and never closed the response handle
            url_handle.close()
        out_handle = open( full_path, "wb" )
        try:
            out_handle.write( contents )
        finally:
            out_handle.close()
    url_opener.close()
184
def get_urls_by_index( self, indices ):
    """Fetch the urls at the given positions in self._labels and write
    each one to '<label>.htm' under self.base_path.

    indices -- iterable of integer indices into self._labels.

    NOTE(review): the `def` line was lost in this chunk; the name comes
    from the class docstring's example — confirm against the complete
    file.
    """
    url_opener = urllib.URLopener()
    for index in indices:
        # single lookup; the original indexed self._labels twice
        label = self._labels[ index ]
        full_path = os.path.join( self.base_path, '%s.htm' % label )
        url_handle = url_opener.open( self._urls[ label ] )
        try:
            contents = url_handle.read()
        finally:
            # bug fix: the original closed the opener on every
            # iteration and never closed the response handle
            url_handle.close()
        out_handle = open( full_path, "wb" )
        try:
            out_handle.write( contents )
        finally:
            out_handle.close()
    url_opener.close()
199
def get_urls_by_range( self, low, hi ):
    """Fetch the urls whose label index lies in [low, hi) and write each
    one to '<label>.htm' under self.base_path.

    low -- first index fetched.
    hi  -- end of the range, exclusive (standard range() semantics).

    NOTE(review): the `def` line was lost in this chunk; the name comes
    from the class docstring's example — confirm against the complete
    file.
    """
    url_opener = urllib.URLopener()
    for index in range( low, hi ):
        label = self._labels[ index ]
        full_path = os.path.join( self.base_path, '%s.htm' % label )
        url_handle = url_opener.open( self._urls[ label ] )
        try:
            contents = url_handle.read()
        finally:
            # bug fix: the original closed the opener on every
            # iteration and never closed the response handle
            url_handle.close()
        out_handle = open( full_path, "wb" )
        try:
            out_handle.write( contents )
        finally:
            out_handle.close()
    url_opener.close()
214