1
+ import markdown
2
+ from bs4 import BeautifulSoup , NavigableString , Tag
3
+ import string
4
+
5
+
6
class Helper:
    """Utilities for converting GitHub-Flavored Markdown sections into JSON-friendly structures."""

    @staticmethod
    def parse_gfm_section(html_content):
        """
        Parse a GitHub-Flavored Markdown section containing a table and surrounding content.
        Returns a dict with "before_html", "columns", "rows_html", and "after_html".
        """
        rendered = markdown.markdown(html_content, extensions=['extra'])
        soup = BeautifulSoup(rendered, "html.parser")

        table = soup.find('table')
        if table is None:
            # No table present: the whole rendered fragment counts as "before".
            return {"before_html": rendered, "columns": [], "rows_html": [], "after_html": ''}

        # find_previous_siblings() yields nearest-first, so restore document order.
        preceding = list(table.find_previous_siblings())
        preceding.reverse()
        before_html = ''.join(str(node) for node in preceding)

        # Siblings after the table are already in document order.
        after_html = ''.join(str(node) for node in table.find_next_siblings())

        # Header cells supply the column names.
        columns = [header.get_text(strip=True) for header in table.find_all('th')]

        # Every row past the first (the header row) contributes its raw <td> HTML.
        body_rows = [
            [str(cell) for cell in row.find_all('td')]
            for row in table.find_all('tr')[1:]
        ]

        return {
            "before_html": before_html,
            "columns": columns,
            "rows_html": body_rows,
            "after_html": after_html
        }

    @staticmethod
    def parse_cell(html_td):
        """Convert a table cell HTML into plain text or a dict for links/images."""
        fragment = BeautifulSoup(html_td, "html.parser")
        anchor = fragment.find('a')
        if anchor is None:
            # Plain cell: return just the stripped text content.
            return fragment.get_text(strip=True)

        cell = {"url": anchor.get('href', '')}
        image = anchor.find('img')
        if image is not None:
            # Image link: record the image attributes alongside the anchor text.
            cell["img_src"] = image.get('src', '')
            cell["title"] = image.get('title', '')
            cell["link_text"] = anchor.get_text(strip=True)
        else:
            # Text-only link.
            cell["link_text"] = anchor.get_text(strip=True)
        return cell

    @staticmethod
    def parse_html_parts(html_fragment):
        """
        Convert an HTML fragment into a list of parts.
        Each part is either:
          - {"text": "..."}
          - {"link": "url", "text": "..."}
          - {"img_src": "url", "alt": "...", "title": "..."}
        """
        soup = BeautifulSoup(html_fragment, 'html.parser')
        parts = []

        def visit(node):
            # Depth-first walk that flattens text, links, and images into `parts`.
            if isinstance(node, NavigableString):
                stripped = str(node).strip()
                # Skip whitespace-only and pure-punctuation runs.
                if stripped and any(ch not in string.punctuation for ch in stripped):
                    parts.append({"text": stripped})
                return
            if not isinstance(node, Tag):
                return
            if node.name == 'a':
                parts.append({
                    "link": node.get('href', ''),
                    "text": node.get_text(strip=True)
                })
            elif node.name == 'img':
                parts.append({
                    "img_src": node.get('src', ''),
                    "alt": node.get('alt', ''),
                    "title": node.get('title', '')
                })
            else:
                # Recurse into children for any other nested tag.
                for child in node.children:
                    visit(child)

        for top_level in soup.contents:
            visit(top_level)

        return parts

    @staticmethod
    def section_to_json(section_result):
        """
        Convert a parsed section into structured JSON.
        Returns {"before": [...], "table": [...], "after": [...]}.
        """
        columns = section_result.get('columns', [])
        # Pair each row's parsed cells with the header names.
        table_rows = [
            dict(zip(columns, [Helper.parse_cell(cell_html) for cell_html in row_html]))
            for row_html in section_result.get('rows_html', [])
        ]

        return {
            "before": Helper.parse_html_parts(section_result.get('before_html', '')),
            "table": table_rows,
            "after": Helper.parse_html_parts(section_result.get('after_html', ''))
        }