User Manual
API Documentation
EncodingDetector
EncodingDetector.encoding()
EncodingDetector.reset()
EncodingDetector.update()
bytes_to_str()
detect_encoding()
detect_mime()
map_encoding_to_html5()
iterate_http_chunks()
read_http_chunk()
NodeType
NodeType.ELEMENT
NodeType.ATTRIBUTE
NodeType.TEXT
NodeType.CDATA_SECTION
NodeType.ENTITY_REFERENCE
NodeType.ENTITY
NodeType.PROCESSING_INSTRUCTION
NodeType.COMMENT
NodeType.DOCUMENT
NodeType.DOCUMENT_TYPE
NodeType.DOCUMENT_FRAGMENT
NodeType.NOTATION
DOMCollection
DOMCollection.__getitem__()
DOMCollection.__iter__()
DOMCollection.get_element_by_id()
DOMCollection.get_elements_by_attr()
DOMCollection.get_elements_by_class_name()
DOMCollection.get_elements_by_tag_name()
DOMCollection.matches()
DOMCollection.query_selector()
DOMCollection.query_selector_all()
DOMContext
DOMElementClassList
DOMElementClassList.__getitem__()
DOMElementClassList.__iter__()
DOMElementClassList.add()
DOMElementClassList.remove()
DOMNode
DOMNode.__getitem__()
DOMNode.__iter__()
DOMNode.__setitem__()
DOMNode.append_child()
DOMNode.decompose()
DOMNode.delattr()
DOMNode.get_element_by_id()
DOMNode.get_elements_by_attr()
DOMNode.get_elements_by_class_name()
DOMNode.get_elements_by_tag_name()
DOMNode.getattr()
DOMNode.hasattr()
DOMNode.insert_before()
DOMNode.matches()
DOMNode.query_selector()
DOMNode.query_selector_all()
DOMNode.remove_child()
DOMNode.replace_child()
DOMNode.setattr()
DOMNode.attrs
DOMNode.child_nodes
DOMNode.class_list
DOMNode.class_name
DOMNode.first_child
DOMNode.first_element_child
DOMNode.html
DOMNode.id
DOMNode.last_child
DOMNode.last_element_child
DOMNode.next
DOMNode.next_element
DOMNode.parent
DOMNode.prev
DOMNode.prev_element
DOMNode.tag
DOMNode.text
DOMNode.type
DOMNode.value
HTMLTree
HTMLTree.parse()
HTMLTree.parse_from_bytes()
HTMLTree.create_element()
HTMLTree.create_text_node()
HTMLTree.body
HTMLTree.document
HTMLTree.head
HTMLTree.title
traverse_dom()
detect_fast()
supported_langs()
train_language_examples()
extract_plain_text()
InterruptType
InterruptType.exception
InterruptType.signal
InterruptType.exception_then_signal
ExecutionTimeout
MemoryLimitExceeded
ResiliparseGuardException
MemGuard
TimeGuard
TimeGuard.progress()
mem_guard()
progress()
progress_loop()
time_guard()
exc_loop()
warc_retry()
ElasticsearchBulkIndex
delete_action()
ensure_index()
index_action()
update_action()
MatchFiles
ReadAllFromText
ReadFromText
StrUtf8Coder
StrUtf8Coder.decode()
ReadAllWarcs
ReadWarcs
WarcRecordType
WarcRecordType.unknown
WarcRecordType.any_type
WarcRecordType.no_type
WarcRecordType.warcinfo
WarcRecordType.response
WarcRecordType.resource
WarcRecordType.request
WarcRecordType.metadata
WarcRecordType.revisit
WarcRecordType.conversion
WarcRecordType.continuation
WarcHeader
WarcHeader.WARC_TYPE
WarcHeader.WARC_RECORD_ID
WarcHeader.WARC_DATE
WarcHeader.CONTENT_LENGTH
WarcHeader.CONTENT_TYPE
WarcHeader.WARC_CONCURRENT_TO
WarcHeader.WARC_BLOCK_DIGEST
WarcHeader.WARC_PAYLOAD_DIGEST
WarcHeader.WARC_IP_ADDRESS
WarcHeader.WARC_REFERS_TO
WarcHeader.WARC_REFERS_TO_TARGET_URI
WarcHeader.WARC_REFERS_TO_DATE
WarcHeader.WARC_TARGET_URI
WarcHeader.WARC_TRUNCATED
WarcHeader.WARC_WARCINFO_ID
WarcHeader.WARC_FILENAME
WarcHeader.WARC_PROFILE
WarcHeader.WARC_IDENTIFIED_PAYLOAD_TYPE
WarcHeader.WARC_SEGMENT_ORIGIN_ID
WarcHeader.WARC_SEGMENT_NUMBER
WarcHeader.WARC_SEGMENT_TOTAL_LENGTH
ArchiveIterator
ArchiveIterator.__iter__()
ArchiveIterator.__next__()
HeaderMap
WarcHeaderMap
WarcRecord
WarcRecord.consume()
WarcRecord.freeze()
WarcRecord.init_headers()
WarcRecord.parse_http()
WarcRecord.parse_warc_headers()
WarcRecord.set_bytes_content()
WarcRecord.set_bytes_payload()
WarcRecord.verify_block_digest()
WarcRecord.verify_payload_digest()
WarcRecord.write()
WarcRecord.content_length
WarcRecord.headers
WarcRecord.http_charset
WarcRecord.http_content_type
WarcRecord.http_date
WarcRecord.http_headers
WarcRecord.http_last_modified
WarcRecord.is_frozen
WarcRecord.is_http
WarcRecord.is_http_parsed
WarcRecord.reader
WarcRecord.record_date
WarcRecord.record_id
WarcRecord.record_type
WarcRecord.stream_pos
WarcRecordPayloadReader
WarcRecordPayloadReader.consume()
WarcRecordPayloadReader.readline()
has_block_digest()
has_content_length_gte()
has_content_length_lte()
has_payload_digest()
has_record_type()
has_valid_block_digest()
has_valid_payload_digest()
is_concurrent()
is_http()
is_warc_10()
is_warc_11()
FastWARCError
ReaderStaleError
StreamError
BrotliReader
BrotliWriter
ChunkedReader
ChunkedWriter
GzipReader
GzipWriter
Lz4Reader
Lz4Writer
WarcReader
WarcReader.close()
WarcReader.frame_start_position()
WarcReader.inner_seek()
WarcReader.inner_tell()
WarcReader.read()
WarcReader.seek()
WarcReader.tell()
WarcWriter
WarcWriter.close()
WarcWriter.finish()
WarcWriter.flush()
WarcWriter.write()
ZstdReader
ZstdWriter
zstd_train_dictionary_from_continuous()
zstd_train_dictionary_from_files()
zstd_train_dictionary_from_samples()
CLI Documentation