| 
					
				 | 
			
			
				@@ -0,0 +1,111 @@ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+"""Markdown parser. 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+Contains parser for md files. 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+""" 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+import re 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+from pathlib import Path 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+from typing import Any, Dict, List, Optional, Tuple, Union, cast 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+from llama_index.readers.file.base_parser import BaseParser 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+class MarkdownParser(BaseParser): 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    """Markdown parser. 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    Extract text from markdown files. 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    Returns dictionary with keys as headers and values as the text between headers. 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    """ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    def __init__( 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        self, 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        *args: Any, 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        remove_hyperlinks: bool = True, 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        remove_images: bool = True, 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        **kwargs: Any, 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    ) -> None: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        """Init params.""" 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        super().__init__(*args, **kwargs) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        self._remove_hyperlinks = remove_hyperlinks 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        self._remove_images = remove_images 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    def markdown_to_tups(self, markdown_text: str) -> List[Tuple[Optional[str], str]]: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        """Convert a markdown file to a dictionary. 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        The keys are the headers and the values are the text under each header. 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        """ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        markdown_tups: List[Tuple[Optional[str], str]] = [] 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        lines = markdown_text.split("\n") 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        current_header = None 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        current_text = "" 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        for line in lines: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            header_match = re.match(r"^#+\s", line) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            if header_match: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+                if current_header is not None: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+                    markdown_tups.append((current_header, current_text)) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+                current_header = line 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+                current_text = "" 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            else: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+                current_text += line + "\n" 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        markdown_tups.append((current_header, current_text)) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        if current_header is not None: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            # pass linting, assert keys are defined 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            markdown_tups = [ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+                (re.sub(r"#", "", cast(str, key)).strip(), re.sub(r"<.*?>", "", value)) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+                for key, value in markdown_tups 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            ] 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        else: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            markdown_tups = [ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+                (key, re.sub("\n", "", value)) for key, value in markdown_tups 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            ] 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        return markdown_tups 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    def remove_images(self, content: str) -> str: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        """Get a dictionary of a markdown file from its path.""" 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        pattern = r"!{1}\[\[(.*)\]\]" 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        content = re.sub(pattern, "", content) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        return content 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    def remove_hyperlinks(self, content: str) -> str: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        """Get a dictionary of a markdown file from its path.""" 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        pattern = r"\[(.*?)\]\((.*?)\)" 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        content = re.sub(pattern, r"\1", content) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        return content 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    def _init_parser(self) -> Dict: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        """Initialize the parser with the config.""" 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        return {} 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    def parse_tups( 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        self, filepath: Path, errors: str = "ignore" 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    ) -> List[Tuple[Optional[str], str]]: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        """Parse file into tuples.""" 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        with open(filepath, "r", encoding="utf-8") as f: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            content = f.read() 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        if self._remove_hyperlinks: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            content = self.remove_hyperlinks(content) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        if self._remove_images: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            content = self.remove_images(content) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        markdown_tups = self.markdown_to_tups(content) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        return markdown_tups 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    def parse_file( 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        self, filepath: Path, errors: str = "ignore" 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    ) -> Union[str, List[str]]: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        """Parse file into string.""" 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        tups = self.parse_tups(filepath, errors=errors) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        results = [] 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        # TODO: don't include headers right now 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        for header, value in tups: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            if header is None: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+                results.append(value) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            else: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+                results.append(f"\n\n{header}\n{value}") 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        return results 
			 |