The following 46 code examples, extracted from open-source Python projects, illustrate how to use xml.etree.cElementTree.iterparse().
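Before the project examples, a minimal sketch of the pattern they all share may help. The file name sample.xml and the record/name tags are illustrative placeholders, not taken from any project below. Note that xml.etree.cElementTree was removed in Python 3.9; on modern interpreters, import xml.etree.ElementTree, which exposes the same iterparse() API.

import xml.etree.cElementTree as ET  # on Python 3.9+, use xml.etree.ElementTree

# iterparse() streams (event, element) pairs as the parser advances, so the
# whole document never has to fit in memory at once.
for event, elem in ET.iterparse("sample.xml", events=("end",)):
    if elem.tag == "record":
        print(elem.findtext("name"))  # the element is complete on its 'end' event
        elem.clear()  # release parsed children to keep memory use bounded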
def _find_elements(self, result, elements):
    """Find interesting elements from XML.

    This function tries to only look for specified elements without
    parsing the entire XML. The specified elements are better located
    near the beginning.

    Args:
      result: response XML.
      elements: a set of interesting element tags.

    Returns:
      A dict from element tag to element value.
    """
    element_mapping = {}
    result = StringIO.StringIO(result)
    for _, e in ET.iterparse(result, events=('end',)):
        if not elements:
            break
        if e.tag in elements:
            element_mapping[e.tag] = e.text
            elements.remove(e.tag)
    return element_mapping
def load_osm(osm_file):
    nodes = dict()
    for event, elem in iterparse(osm_file, events=("start", "end")):
        # Whenever the iterator encounters an opening tag
        if event == "start":
            if elem.tag == "node":
                curr_id = int(elem.attrib["id"])
                lat = float(elem.attrib["lat"])
                lon = float(elem.attrib["lon"])
                curr_elem = (lat, lon)
        # Whenever the iterator encounters a closing tag
        elif event == "end":
            if elem.tag == "node":
                nodes[curr_id] = curr_elem
    return nodes
def gpx_parser(fh):
    it = ElementTree.iterparse(fh, events=('start', 'end'))

    # look for the start gpx tag to fail fast
    for event, elem in it:
        if event == 'start' and elem.tag.endswith('}gpx'):
            break
    else:
        raise ValueError('Not a gpx file: %s' % fh.name)

    # do the main parse
    for event, elem in it:
        if event == 'end' and elem.tag.endswith('}trkpt'):
            latlon = (float(elem.attrib['lat']), float(elem.attrib['lon']))
            elev = np.nan
            time = None
            for child in elem:
                tag_name = child.tag.rsplit('}', 1)[1]
                if tag_name == 'ele':
                    elev = float(child.text)
                elif tag_name == 'time':
                    time = child.text
            yield latlon, time, elev
            elem.clear()
def parse_nsmap(fil):
    events = "start", "start-ns", "end-ns"
    root = None
    ns_map = []
    for event, elem in ElementTree.iterparse(fil, events):
        if event == "start-ns":
            ns_map.append(elem)
        elif event == "end-ns":
            ns_map.pop()
        elif event == "start":
            if root is None:
                root = elem
            elem.set(NS_MAP, dict(ns_map))
    return ElementTree.ElementTree(root)
def _poll(self, url):
    request = urllib2.Request(url)
    for key, value in self.http_headers:
        request.add_header(key, value)

    try:
        self.log.info('Downloading feed from: "%s"', url)
        _, fileobj = yield utils.fetch_url(request)
    except utils.FetchUrlFailed as e:
        self.log.error('Failed to download feed "%s": %r', url, e)
        idiokit.stop(False)

    self.log.info("Finished downloading the feed.")

    byte = fileobj.read(1)
    while byte and byte != "<":
        byte = fileobj.read(1)

    if byte == "<":
        fileobj.seek(-1, 1)

    try:
        for _, elem in etree.iterparse(fileobj):
            for event in self._parse(elem, url):
                if event:
                    yield idiokit.send(event)
    except ParseError as e:
        self.log.error('Invalid format on feed: "%s", "%r"', url, e)
def poll(self):
    url = self.feed_url % self.application_key

    try:
        self.log.info("Checking if {0!r} has new data".format(url))
        info, _ = yield utils.fetch_url(HeadRequest(url))

        etag = info.get("etag", None)
        if etag is not None and self._etag == etag:
            raise bot.PollSkipped("no new data detected (ETag stayed the same)")

        self.log.info("Downloading data from {0!r}".format(url))
        _, fileobj = yield utils.fetch_url(url)
    except utils.FetchUrlFailed as error:
        raise bot.PollSkipped("failed to download {0!r} ({1})".format(url, error))

    self.log.info("Downloaded data from {0!r}".format(url))

    reader = BZ2Reader(fileobj)
    try:
        depth = 0
        sites = dict()

        for event, element in etree.iterparse(reader, events=("start", "end")):
            if event == "start" and element.tag == "entry":
                depth += 1

            if event == "end" and element.tag == "entry":
                yield self._handle_entry(element, sites)
                depth -= 1

            if event == "end" and depth == 0:
                element.clear()
    except SyntaxError as error:
        raise bot.PollSkipped("syntax error in report {0!r} ({1})".format(url, error))
    else:
        self._etag = etag
def iterparse(self, file):
    return self.create_fa().iterparse(file, self.validate_dtd)

# I need a better name
def handler_parse(self, file, state=None):
    for x in self.parse(file, state):
        pass

# I plan to implement 'iterparse' as a near copy of 'parse'
# but without any references to callbacks
def iterparse(self, file, validate_dtd=False):
    return self.parse(file, None, validate_dtd)
def test_parse():
    import os
    filename = "/Users/dalke/Music/iTunes/iTunes Music Library.xml"
    if not os.path.exists(filename):
        print "Cannot find %r: skipping test" % (filename,)
        return

    # Work through callbacks
    ef = IterParseFilter()

    def print_info(event, ele, state):
        d = {}
        children = iter(ele)
        for child in children:
            key = child.text
            value = children.next().text
            d[key] = value
        print "%r is by %r" % (d["Name"], d.get("Artist", "<unknown>"))
        ele.clear()

    ef.on_end("/plist/dict/dict/dict", print_info)
    ef.handler_parse(open(filename))

    # Work through iterators
    ef = IterParseFilter()
    ef.iter_end("/plist/dict/dict/dict")
    for (event, ele) in ef.iterparse(open(filename)):
        d = {}
        children = iter(ele)
        for child in children:
            key = child.text
            value = children.next().text
            d[key] = value
        print "%r is a %r song" % (d["Name"], d.get("Genre", "<unknown>"))
        ele.clear()
def parse_new_asx(data):
    # Copied from mopidy.audio.playlists
    try:
        for _, element in elementtree.iterparse(data):
            element.tag = element.tag.lower()  # normalize

            for ref in element.findall('entry/ref[@href]'):
                yield fix_asf_uri(ref.get('href', '').strip())

            for entry in element.findall('entry[@href]'):
                yield fix_asf_uri(entry.get('href', '').strip())
    except elementtree.ParseError:
        return
def main(argv):
    file_obj = open(argv[1])
    print "Reading XML file ",
    sys.stdout.flush()

    level = 0
    sim_list = []
    for event, elem in ElementTree.iterparse(file_obj, events=("start", "end")):
        if event == "start":
            level += 1
        if event == "end":
            level -= 1
            if level == 0 and elem.tag == 'FlowMonitor':
                sim = Simulation(elem)
                sim_list.append(sim)
                elem.clear()  # won't need this any more
                sys.stdout.write(".")
                sys.stdout.flush()
    print " done."

    for sim in sim_list:
        for flow in sim.flows:
            t = flow.fiveTuple
            proto = {6: 'TCP', 17: 'UDP'}[t.protocol]
            print "FlowID: %i (%s %s/%s --> %s/%i)" % \
                (flow.flowId, proto, t.sourceAddress, t.sourcePort,
                 t.destinationAddress, t.destinationPort)
            print "\tTX bitrate: %.2f kbit/s" % (flow.txBitrate * 1e-3,)
            print "\tRX bitrate: %.2f kbit/s" % (flow.rxBitrate * 1e-3,)
            print "\tMean Delay: %.2f ms" % (flow.delayMean * 1e3,)
            print "\tPacket Loss Ratio: %.2f %%" % (flow.packetLossRatio * 100)
def process_file(self, file_name):
    data = self.read_file(file_name, self.encoding)
    data = self.preprocess_data(data)

    try:
        stream = IO_Stream(bytearray("\n".join(data), encoding="utf-8"))
        self.tree = ET.iterparse(stream)
        if self._strip_namespace:
            for _, element in self.tree:
                element.tag = element.tag.rpartition("}")[-1]
    except Exception as e:
        print(self._current_file)
        print_error_context(str(e), "\n".join(data).split("\n"))
        raise e

    self.process_tree(self.tree)
def parse_wos_xml(fp, global_year, good_cf, bad_cf, ntest=None):
    """
    driver func, parse file fp, push good and bad records
    accordingly to good_cf and bad_cf
    :param fp: filepointer to be parsed
    :param global_year: apriori known year
    :param good_cf: chunk flusher of good records
    :param bad_cf: chunk flusher of bad records
    :param ntest: number of records for test mode
    :return:
    """
    events = ('start', 'end')
    tree = cET.iterparse(fp, events)
    context = iter(tree)
    event, root = next(context)
    rec_ = 'REC'
    it = 0
    for event, pub in context:
        if event == "end" and pub.tag == rec_:
            ans = parse_record(pub, global_year)
            if ans[0]:
                good_cf.push(ans[1])
            else:
                msg = ' parse_wos_xml() : wos_id {0} failed ' \
                      'to parse, placed in the bad heap'.format(ans[1]['id'])
                logging.error(msg)
                bad_cf.push(ans[1])
            if not good_cf.ready() or not bad_cf.ready():
                break
            root.clear()
            it += 1
            if ntest and it >= ntest:
                break
def load_osm(osm_file):
    """
    loads all edges and nodes from the .osm (XML) file and wraps them into objects.
    :param osm_file: filename of the .osm file containing the map data.
    :return: edges (list), nodes (dict)
    """
    nodes = dict()
    edges = list()
    for event, elem in iterparse(osm_file, events=("start", "end")):
        # Whenever the iterator encounters an opening tag
        if event == "start":
            if elem.tag == "node":
                curr_id = int(elem.attrib["id"])
                lat = float(elem.attrib["lat"])
                lon = float(elem.attrib["lon"])
                curr_elem = (lat, lon)
            elif elem.tag == "way":
                curr_elem = Edge(int(elem.attrib["id"]))
            elif elem.tag == "nd":
                curr_elem.nodes.append(elem.attrib["ref"])
        # Whenever the iterator encounters a closing tag
        elif event == "end":
            if elem.tag == "node":
                nodes[curr_id] = curr_elem
            elif elem.tag == "way":
                edges.append(curr_elem)
    return nodes, edges
def get_element(osm_file, tags=('node', 'way', 'relation')):
    """
    Yield element if it is the right type of tag

    Reference:
    http://stackoverflow.com/questions/3095434/inserting-newlines-in-xml-file-generated-via-xml-etree-elementtree-in-python
    """
    context = iter(ET.iterparse(osm_file, events=('start', 'end')))
    _, root = next(context)
    for event, elem in context:
        if event == 'end' and elem.tag in tags:
            yield elem
            root.clear()
def count_tags(filename):
    tags = {}
    for event, elem in ET.iterparse(filename, events=("start",)):
        if elem.tag in tags.keys():
            tags[elem.tag] += 1
        else:
            tags[elem.tag] = 1
    return tags
def process_map(filename):
    keys = {"lower": 0, "lower_colon": 0, "problemchars": 0, "other": 0}
    print "Sample 'other' tags, randomly (2%) selected:"
    print "\n"
    for _, element in ET.iterparse(filename):
        keys = key_type(element, keys)
    print "\n"
    print "Count of the four Tag Categories:"
    print "\n"
    print keys
    return keys
def process_map(filename):
    users = set()
    for _, element in ET.iterparse(filename):
        if get_user(element):
            users.add(get_user(element))
    return users
def audit_street(osmfile):
    osm_file = open(osmfile, "r")
    street_types = defaultdict(set)
    for event, elem in ET.iterparse(osm_file, events=("start",)):
        if elem.tag == "node" or elem.tag == "way":
            for tag in elem.iter("tag"):
                if is_street_name(tag):
                    audit_street_type(street_types, tag.attrib['v'])
    osm_file.close()
    return street_types
def audit_state(osmfile):
    osm_file = open(osmfile, "r")
    prob_state = set()
    for event, elem in ET.iterparse(osm_file, events=("start",)):
        if elem.tag == "node" or elem.tag == "way":
            for tag in elem.iter("tag"):
                if is_state(tag):
                    if tag.attrib['v'] != 'HI':
                        prob_state.add(tag.attrib['v'])
    osm_file.close()
    return prob_state
def get_element(osm_file, tags=('node', 'way', 'relation')):
    """Yield element if it is the right type of tag"""
    context = ET.iterparse(osm_file, events=('start', 'end'))
    _, root = next(context)
    for event, elem in context:
        if event == 'end' and elem.tag in tags:
            yield elem
            root.clear()
def tcx_parser(fh):
    it = ElementTree.iterparse(fh, events=('start', 'end'))

    # look for the start TrainingCenterDatabase tag to fail fast
    for event, elem in it:
        if event == 'start' and elem.tag.endswith('}TrainingCenterDatabase'):
            break
    else:
        raise ValueError('Not a tcx file: %s' % fh.name)

    # do the main parse
    for event, elem in it:
        if event == 'end' and elem.tag.endswith('}Trackpoint'):
            latlon = None
            elev = np.nan
            time = None
            for child in elem:
                tag_name = child.tag.rsplit('}', 1)[1]
                if tag_name == 'Time':
                    time = child.text
                elif tag_name == 'AltitudeMeters':
                    elev = float(child.text)
                elif tag_name == 'Position':
                    vals = dict((c.tag.rsplit('}', 1)[1], float(c.text))
                                for c in child)
                    latlon = (vals['LatitudeDegrees'], vals['LongitudeDegrees'])
            if latlon is not None:
                yield latlon, time, elev
            elem.clear()
def parse_root(raw):
    "Efficiently parses the root element of a *raw* XML document, returning a tuple of its qualified name and attribute dictionary."
    fp = StringIO(raw)
    for event, element in ET.iterparse(fp, events=('start',)):
        return (element.tag, element.attrib)
def detect_xspf_header(data):
    data = data[0:150]
    if b'xspf' not in data.lower():
        return False

    try:
        data = io.BytesIO(data)
        for event, element in elementtree.iterparse(data, events=(b'start',)):
            return element.tag.lower() == '{http://xspf.org/ns/0/}playlist'
    except elementtree.ParseError:
        pass
    return False
def detect_asx_header(data: bytes):
    data = data[0:50]
    if b'asx' not in data.lower():
        return False

    try:
        bytesIO = io.BytesIO(data)
        for event, element in elementtree.iterparse(bytesIO, events=(b'start',)):
            return element.tag.lower() == 'asx'
    except elementtree.ParseError:
        pass
    return False
def parse_xspf(data: bytes):
    try:
        # Last element will be root.
        element = None
        for event, element in elementtree.iterparse(io.BytesIO(data)):
            element.tag = element.tag.lower()  # normalize

        if element is not None:
            ns = 'http://xspf.org/ns/0/'
            for track in element.iterfind('{%s}tracklist/{%s}track' % (ns, ns)):
                yield track.findtext('{%s}location' % ns)
    except elementtree.ParseError:
        return
def parse_asx(data):
    try:
        # Last element will be root.
        element = None
        for event, element in elementtree.iterparse(io.BytesIO(data)):
            element.tag = element.tag.lower()  # normalize

        if element is not None:
            for ref in element.findall('entry/ref[@href]'):
                yield ref.get('href', '').strip()

            for entry in element.findall('entry[@href]'):
                yield entry.get('href', '').strip()
    except elementtree.ParseError:
        return
def is_svg(self, f):
    """
    Check if provided file is svg
    """
    f.seek(0)
    tag = None
    try:
        for event, el in et.iterparse(f, ('start',)):
            tag = el.tag
            break
    except et.ParseError:
        pass
    return tag == '{http://www.w3.org/2000/svg}svg'
def get_element(osm_file, tags=('node', 'way', 'relation')):
    """Yield element if it is the right type of tag

    Reference:
    http://stackoverflow.com/questions/3095434/inserting-newlines-in-xml-file-generated-via-xml-etree-elementtree-in-python
    """
    context = iter(ET.iterparse(osm_file, events=('start', 'end')))
    _, root = next(context)
    for event, elem in context:
        if event == 'end' and elem.tag in tags:
            yield elem
            root.clear()
def iterate_qa_pairs(num_iter=None):
    """Iterates through question-answer pairs in a single file.

    Args:
        num_iter: int (default: None), number of times to iterate.
            If None, iterates infinitely.

    Yields:
        subject: the question title (max length = QUESTION_TITLE_MAXLEN)
        bestanswer: the body of the best answer (max length = ANSWER_MAXLEN)
    """
    def _parse_document(elem):
        subject = elem.find('subject')
        bestanswer = elem.find('bestanswer')
        return ('' if subject is None else subject.text,
                '' if bestanswer is None else bestanswer.text)

    if num_iter is None:
        iterator = itertools.count()
    else:
        iterator = xrange(num_iter)

    for _ in iterator:
        with open(DATA_PATH, 'r') as f:
            parser = ET.iterparse(f)
            for event, elem in parser:
                if elem.tag == 'document':
                    yield _parse_document(elem)
                    elem.clear()  # Important for avoiding memory issues.
def is_svg(_file):
    """
    Check is a given file is SVG or not.

    A file is considered to be SVG if:
    - Its mimetype is "application/svg+xml" or "image/svg+xml".
    - Its mimetype is "text/html" or "application/xml" or "text/xml" or
      "text/plain" and it has the svg tag with xmlns http://www.w3.org/2000/svg

    :param _file: A GenericFile object that should be checked for SVG.
    :return: Boolean corresponding to whether the file is SVG.
    """
    mime = _file.mime()
    if mime in ('application/svg+xml', 'image/svg+xml'):
        return True
    elif mime in ('application/xml', 'text/xml', 'text/html', 'text/plain'):
        tag = None
        with open(_file.fetch('filename'), "r") as f:
            # cElementTree needs the events as bytes in python2
            items = cElementTree.iterparse(f, events=(str('start'),))
            try:
                _, el = items.next()
                tag = el.tag
            except cElementTree.ParseError:
                return False
        return tag == '{http://www.w3.org/2000/svg}svg'
def load_data(datafile):
    context = ET.iterparse(datafile, events=("start", "end"))
    logging.debug("Got context")
    context = iter(context)
    return context

# uid -> wos_id, citedAuthor, year, page, volume, citedTitle, citedWork, doi
def read( self ):
    context = ElementTree.iterparse(self.gccxml_file, events=("start", "end"))
    for event, elem in context:
        if event == 'start':
            self.startElement( elem.tag, elem.attrib )
        else:
            self.endElement( elem.tag )
            elem.clear()
    self.endDocument()
def parse_new_asx(data):
    # Copied from mopidy.audio.playlists
    try:
        for event, element in elementtree.iterparse(data):
            element.tag = element.tag.lower()  # normalize
    except elementtree.ParseError:
        return

    for ref in element.findall('entry/ref[@href]'):
        yield fix_asf_uri(ref.get('href', '').strip())

    for entry in element.findall('entry[@href]'):
        yield fix_asf_uri(entry.get('href', '').strip())
def parse(path, tag=REPORT_HOST):
    """Parse Nessus XML export from Workbench API into dicts.

    :param path: The file path.
    :param tag: The XML tag to iterate on. It should be
        WorkbenchParser.REPORT_HOST or WorkbenchParser.REPORT_ITEM.
    """
    assert tag in [WorkbenchParser.REPORT_HOST, WorkbenchParser.REPORT_ITEM], u'Valid tag for parsing.'
    report_host = None
    host_properties = None
    report_items = [] if tag == WorkbenchParser.REPORT_HOST else None
    try:
        for event, elem in ET.iterparse(path, events=('start', 'end')):
            if event == 'start':
                if elem.tag == 'ReportHost':
                    report_host = WorkbenchParser._from_report_host(elem)
            if event == 'end':
                if elem.tag == WorkbenchParser.REPORT_HOST:
                    elem.clear()
                    if tag == elem.tag:
                        yield {
                            'report_host': report_host,
                            'host_properties': host_properties,
                            'report_items': report_items,
                        }
                        report_items = []
                if elem.tag == WorkbenchParser.HOST_PROPERTIES:
                    host_properties = WorkbenchParser._from_host_properties(elem)
                    elem.clear()
                if elem.tag == WorkbenchParser.REPORT_ITEM:
                    report_item = WorkbenchParser._from_report_item(elem)
                    elem.clear()
                    if tag == elem.tag:
                        yield report_item
                    elif tag == WorkbenchParser.REPORT_HOST:
                        report_items.append(report_item)
    except ET.ParseError as e:
        logging.warn(u'Failed to parse Nessus XML: ' + e.msg)
        # TODO The service return malformed XML for empty set, for now we won't raise an exception for what should
        # TODO be a normal state. However, this might masked out real error from bubble up (unlikely).
        # raise TenableIOException(u'Failed to parse Nessus XML: ' + e.message)
def read_mir(xml_file, quiet=False):
    out_list = []
    tmp_list = []
    error = ""

    # Open the MIR output file.
    try:
        for (_, reg_item) in et.iterparse(xml_file, events=('end',)):
            if reg_item.tag != 'RegistryItem':
                continue

            path_name = reg_item.find("Path").text
            if not path_name:
                print "[-] Error XML missing Path"
                print et.tostring(reg_item)
                reg_item.clear()
                continue
            path_name = path_name.lower()

            # Check to see that we have the right registry value.
            if 'control\\session manager\\appcompatcache\\appcompatcache' in path_name \
                    or 'control\\session manager\\appcompatibility\\appcompatcache' in path_name:
                # return the base64 decoded value data.
                bin_data = binascii.a2b_base64(reg_item.find('Value').text)
                tmp_list = read_cache(bin_data, quiet)

                if tmp_list:
                    for row in tmp_list:
                        if g_verbose:
                            row.append(path_name)
                        if row not in out_list:
                            out_list.append(row)
            reg_item.clear()
    except (AttributeError, TypeError, IOError), err:
        error = "[-] Error reading MIR XML: %s" % str(err)
        print error
        return (error, None)

    if len(out_list) == 0:
        return (error, None)
    else:
        # Add the header and return the list.
        if g_verbose:
            out_list.insert(0, output_header + ['Key Path'])
        else:
            # Only return unique entries.
            out_list = unique_list(out_list)
            out_list.insert(0, output_header)
        return (error, out_list)

# Get Shim Cache data from .reg file.
# Finds the first key named "AppCompatCache" and parses the
# Hex data that immediately follows. It's a brittle parser,
# but the .reg format doesn't change too often.
def processFile(self, file_fullpath, hostID, instanceID, rowsData):
    rowNumber = 0
    check_tags = ['LastModified', 'FilePath']
    # the 'end' event signifies when the end of the XML node has been reached,
    # and therefore when all values can be parsed
    try:
        xml_data = loadFile(file_fullpath)
        for event, element in etree.iterparse(xml_data, events=("end",)):
            skip_entry = False
            tag_dict = {}
            if element.tag == "PersistenceItem":
                self._processElement(element, tag_dict)

                # Check we have everything we need and ignore entries with critical XML errors on them
                for tag in check_tags:
                    if tag in tag_dict:
                        if tag_dict[tag] is None:
                            if 'AppCompatPath' in tag_dict:
                                logger.warning("Malformed tag [%s: %s] in %s, entry: %s (skipping entry)"
                                               % (tag, tag_dict[tag], tag_dict['AppCompatPath'], file_fullpath))
                            else:
                                logger.warning("Malformed tag [%s: %s] in %s, entry: Unknown (skipping entry)"
                                               % (tag, tag_dict[tag], file_fullpath))
                            skip_entry = True
                            break

                # If the entry is valid do some housekeeping:
                if not skip_entry:
                    if tag_dict['ExecutionFlag'] == '1':
                        tmpExecFlag = True
                    elif tag_dict['ExecutionFlag'] == '0':
                        tmpExecFlag = False
                    else:
                        tmpExecFlag = tag_dict['ExecutionFlag']
                    namedrow = settings.EntriesFields(
                        HostID=hostID,
                        EntryType=settings.__APPCOMPAT__,
                        RowNumber=rowNumber,
                        InstanceID=instanceID,
                        LastModified=(tag_dict['LastModified'].replace("T", " ").replace("Z", "")
                                      if 'LastModified' in tag_dict else '0001-01-01 00:00:00'),
                        LastUpdate=(tag_dict['LastUpdate'].replace("T", " ").replace("Z", "")
                                    if 'LastUpdate' in tag_dict else '0001-01-01 00:00:00'),
                        FileName=ntpath.basename(tag_dict['FilePath']),
                        FilePath=ntpath.dirname(tag_dict['FilePath']),
                        Size=(tag_dict['Size'] if 'Size' in tag_dict else 'N/A'),
                        ExecFlag=tmpExecFlag)
                    rowsData.append(namedrow)
                    rowNumber += 1
            else:
                pass
            element.clear()
        xml_data.close()
    except Exception as e:
        print e.message
        print traceback.format_exc()
        pass
def processFile(self, file_fullpath, hostID, instanceID, rowsData):
    rowNumber = 0
    check_tags = ['LastModified', 'AppCompatPath']
    try:
        xml_data = loadFile(file_fullpath)
        for event, element in etree.iterparse(xml_data, events=("end",)):
            skip_entry = False
            tag_dict = {}
            if element.tag == "AppCompatItemExtended":
                self._processElement(element, tag_dict)

                # From time to time we get some entries with no real data on them for some unknown reason, skip for now
                if 'AppCompatPath' in tag_dict:
                    if tag_dict['AppCompatPath'] == 'N/A':
                        logger.debug("ShimCache entry with no AppCompatPath [ControlSetSeq: %s], entry: %s. (skipping entry)"
                                     % (tag_dict['ControlSetSeq'], file_fullpath))
                        break

                # Check we have everything we need and ignore entries with critical XML errors on them
                for tag in check_tags:
                    if tag not in tag_dict or tag_dict[tag] is None:
                        if tag not in tag_dict:
                            if 'AppCompatPath' in tag_dict:
                                logger.warning("Missing tag [%s] in %s, entry: %s (skipping entry)"
                                               % (tag, tag_dict['AppCompatPath'], file_fullpath))
                            else:
                                logger.warning("Malformed tag [%s] in %s, entry: Unknown (skipping entry)"
                                               % (tag, file_fullpath))
                            skip_entry = True
                            break
                        if tag_dict[tag] is None:
                            if 'AppCompatPath' in tag_dict:
                                logger.warning("Malformed tag [%s: %s] in %s, entry: %s (skipping entry)"
                                               % (tag, tag_dict[tag], tag_dict['AppCompatPath'], file_fullpath))
                            else:
                                logger.warning("Malformed tag [%s: %s] in %s, entry: Unknown (skipping entry)"
                                               % (tag, tag_dict[tag], file_fullpath))
                            skip_entry = True
                            break

                # If the entry is valid do some housekeeping:
                if not skip_entry:
                    if tag_dict['ExecutionFlag'] == '1':
                        tmpExecFlag = True
                    elif tag_dict['ExecutionFlag'] == '0':
                        tmpExecFlag = False
                    else:
                        tmpExecFlag = tag_dict['ExecutionFlag']
                    namedrow = settings.EntriesFields(
                        HostID=hostID,
                        EntryType=settings.__APPCOMPAT__,
                        RowNumber=rowNumber,
                        InstanceID=instanceID,
                        LastModified=(tag_dict['LastModified'].replace("T", " ").replace("Z", "")
                                      if 'LastModified' in tag_dict else '0001-01-01 00:00:00'),
                        LastUpdate=(tag_dict['LastUpdate'].replace("T", " ").replace("Z", "")
                                    if 'LastUpdate' in tag_dict else '0001-01-01 00:00:00'),
                        FileName=ntpath.basename(tag_dict['AppCompatPath']),
                        FilePath=ntpath.dirname(tag_dict['AppCompatPath']),
                        Size=(tag_dict['Size'] if 'Size' in tag_dict else 'N/A'),
                        ExecFlag=tmpExecFlag)
                    rowsData.append(namedrow)
                    rowNumber += 1
            else:
                pass
            element.clear()
        xml_data.close()
    except Exception as e:
        print e.message
        print traceback.format_exc()
        pass
def processFile(self, file_fullpath, hostID, instanceID, rowsData):
    rowNumber = 0
    check_tags = ['LastModified', 'AppCompatPath']
    try:
        xml_data = loadFile(file_fullpath)
        for event, element in etree.iterparse(xml_data, events=("end",)):
            skip_entry = False
            tag_dict = {}
            if element.tag == "ShimCacheItem":
                self._processElement(element, tag_dict)

                # Check we have everything we need and ignore entries with critical XML errors on them
                for tag in check_tags:
                    if tag not in tag_dict or tag_dict[tag] is None:
                        if 'AppCompatPath' in tag_dict:
                            logger.warning("Malformed tag [%s] in %s, entry: %s (skipping entry)"
                                           % (tag, tag_dict['AppCompatPath'], file_fullpath))
                        else:
                            # use .get() here: tag may be absent from tag_dict entirely
                            logger.warning("Malformed tag [%s: %s] in %s, entry: Unknown (skipping entry)"
                                           % (tag, tag_dict.get(tag), file_fullpath))
                        skip_entry = True
                        break

                # If the entry is valid do some housekeeping:
                if not skip_entry:
                    if 'ExecutionFlag' in tag_dict:
                        tmpExexFlag = tag_dict['ExecutionFlag']
                    else:
                        # Note that Shim Shady does not extract ExecFlag on some platforms (at least Windows 10).
                        tmpExexFlag = 'unk'
                    namedrow = settings.EntriesFields(
                        HostID=hostID,
                        EntryType=settings.__APPCOMPAT__,
                        RowNumber=rowNumber,
                        InstanceID=instanceID,
                        LastModified=(tag_dict['LastModified'].replace("T", " ").replace("Z", "")
                                      if 'LastModified' in tag_dict else '0001-01-01 00:00:00'),
                        LastUpdate=(tag_dict['LastUpdate'].replace("T", " ").replace("Z", "")
                                    if 'LastUpdate' in tag_dict else '0001-01-01 00:00:00'),
                        FileName=ntpath.basename(tag_dict['AppCompatPath']),
                        FilePath=ntpath.dirname(tag_dict['AppCompatPath']),
                        Size=(tag_dict['Size'] if 'Size' in tag_dict else 'N/A'),
                        ExecFlag=tmpExexFlag)
                    rowsData.append(namedrow)
                    rowNumber += 1
            else:
                pass
            element.clear()
        xml_data.close()
    except Exception as e:
        print e.message
        print traceback.format_exc()
        pass
def processAbstractFile(abstractFile, outFile, processFunction):
    count = 0

    # These XML files are huge, so skip through each MedlineCitation element using etree
    for event, elem in etree.iterparse(abstractFile, events=('start', 'end', 'start-ns', 'end-ns')):
        if (event == 'end' and elem.tag == 'MedlineCitation'):
            count = count + 1

            # Find the elements for the PubMed ID, and publication date information
            pmid = elem.findall('./PMID')
            yearFields = elem.findall('./Article/Journal/JournalIssue/PubDate/Year')
            medlineDateFields = elem.findall('./Article/Journal/JournalIssue/PubDate/MedlineDate')

            # Try to extract the pmidID
            pmidText = ''
            if len(pmid) > 0:
                pmidText = " ".join([a.text.strip() for a in pmid if a.text])

            pmcidText = ''

            # Try to extract the publication date
            pubYear = 0
            if len(yearFields) > 0:
                pubYear = yearFields[0].text
            if len(medlineDateFields) > 0:
                pubYear = medlineDateFields[0].text[0:4]

            # Extract the title of paper
            title = elem.findall('./Article/ArticleTitle')
            titleText = extractTextFromElemList(title)
            titleText = [removeWeirdBracketsFromOldTitles(t) for t in titleText]

            # Extract the abstract from the paper
            abstract = elem.findall('./Article/Abstract/AbstractText')
            abstractText = extractTextFromElemList(abstract)

            # Combine all the text we want to process
            allText = titleText + abstractText
            allText = [t for t in allText if len(t) > 0]
            allText = [htmlUnescape(t) for t in allText]
            allText = [removeBracketsWithoutWords(t) for t in allText]

            # Information about the source of this text
            textSourceInfo = {'pmid': pmidText, 'pmcid': pmcidText, 'pubYear': pubYear}

            # Get the co-occurrences using a single list
            processFunction(outFile, allText, textSourceInfo)

            # Important: clear the current element from memory to keep memory usage low
            elem.clear()
def tabulate(datafile, cnx):
    print "Tabulate"
    #tree = ET.parse(datafile)
    #root = tree.getroot()
    count = 0

    # get an iterable
    context = ET.iterparse(datafile, events=("start", "end"))
    logging.debug("Got context")

    # turn it into an iterator
    context = iter(context)

    # get the root element
    event, root = context.next()
    logging.debug("Got root")

    for event, elem in context:
        #if event == "start":
        #    print "Event:{0} \nElem:{1} \nAttr:{2} \nValue:{3}".format(event, elem.tag, elem.attrib, elem.value)
        if elem.tag == "REC" and event == "start":
            print "Foo"
            for child in elem:
                print child.tag, child.attrib
        #if event == "end" and elem.tag == "REC":
        #    print "Event:{0} Elem:{1}".format(event, elem)
        count = count + 1
        root.clear()
        if count == 100:
            break

    '''
    for event, elem in tree.iterparse(datafile):
        print "Event:{0} Elem:{1}".format(event, elem)
        if elem.tag == "REC":
            print "Foo"
            count = count + 1
            if count == 100:
                break
    '''
    return