AIdeaText committed on
Commit
e031c18
1 Parent(s): 5b5c9f7

Create syntax_analysis.py

Browse files
Files changed (1) hide show
  1. modules/syntax_analysis.py +132 -0
modules/syntax_analysis.py ADDED
@@ -0,0 +1,132 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # syntax_analysis.py
2
+ import spacy
3
+ import networkx as nx
4
+ import matplotlib.pyplot as plt
5
+ from collections import Counter
6
+
7
# Load the Spanish spaCy pipeline once, at import time. `visualize_syntax`
# calls this module-level `nlp` object to parse its input text.
nlp = spacy.load("es_core_news_lg")
9
+
10
# Hex fill color for each coarse part-of-speech tag. The same palette is used
# for graph nodes, legend swatches and repeated-word highlighting; tags that
# are missing here fall back to white ('#FFFFFF') at the lookup sites.
POS_COLORS = {
    "ADJ": "#FFA07A",    # light salmon
    "ADP": "#98FB98",    # pale green
    "ADV": "#87CEFA",    # light sky blue
    "AUX": "#DDA0DD",    # plum
    "CCONJ": "#F0E68C",  # khaki
    "DET": "#FFB6C1",    # light pink
    "INTJ": "#FF6347",   # tomato
    "NOUN": "#90EE90",   # light green
    "NUM": "#FAFAD2",    # light goldenrod yellow
    "PART": "#D3D3D3",   # light gray
    "PRON": "#FFA500",   # orange
    "PROPN": "#20B2AA",  # light sea green
    "SCONJ": "#DEB887",  # burlywood
    "SYM": "#7B68EE",    # medium slate blue
    "VERB": "#FF69B4",   # hot pink
    "X": "#A9A9A9",      # dark gray
}
29
+
30
# Spanish display name for each coarse part-of-speech tag. These strings are
# shown to the user in node labels and in the plot legend.
POS_TRANSLATIONS = {
    'ADJ': 'Adjetivo',
    'ADP': 'Adposición',  # was misspelled 'Advposición'
    'ADV': 'Adverbio',
    'AUX': 'Auxiliar',
    'CCONJ': 'Conjunción Coordinante',
    'DET': 'Determinante',
    'INTJ': 'Interjección',
    'NOUN': 'Sustantivo',
    'NUM': 'Número',
    'PART': 'Partícula',
    'PRON': 'Pronombre',
    'PROPN': 'Nombre Propio',
    'SCONJ': 'Conjunción Subordinante',
    'SYM': 'Símbolo',
    'VERB': 'Verbo',
    'X': 'Otro',
}
48
+
49
def count_pos(doc):
    """Tally the part-of-speech tags of the tokens in *doc*.

    Args:
        doc: An iterable of tokens exposing a ``pos_`` attribute.

    Returns:
        A ``Counter`` mapping each tag to its number of occurrences. Tokens
        tagged ``'PUNCT'`` are left out of the tally.
    """
    tally = Counter()
    for tok in doc:
        tag = tok.pos_
        if tag == 'PUNCT':
            continue
        tally[tag] += 1
    return tally
51
+
52
def create_syntax_graph(doc):
    """Build a directed head -> dependent graph over the distinct words of *doc*.

    Each distinct lower-cased word form (punctuation excluded) becomes one
    node, numbered in order of first appearance. Node attributes:

    * ``label``: the first-seen surface form plus its translated POS name
    * ``pos``:   the POS tag of the first occurrence
    * ``size``:  500 times the number of tokens in *doc* sharing that POS tag
    * ``color``: the ``POS_COLORS`` entry for the tag (white if unknown)

    An edge ``head -> dependent`` labelled with ``token.dep_`` is added for
    every non-punctuation, non-ROOT token whose head is not punctuation.

    Args:
        doc: A parsed document whose tokens expose ``text``, ``pos_``,
            ``dep_`` and ``head``.

    Returns:
        A ``(graph, word_colors)`` tuple, where ``word_colors`` maps each
        lower-cased word form to the color assigned to its node.
    """
    G = nx.DiGraph()
    pos_counts = count_pos(doc)
    word_nodes = {}
    word_colors = {}

    # Pass 1: create every node up front. Doing this before adding any edge
    # matters because a dependent often precedes its head in the text (e.g. a
    # determiner before its noun); in a single pass the head node would not
    # exist yet and the edge would be silently dropped.
    for token in doc:
        if token.pos_ == 'PUNCT':
            continue
        lower_text = token.text.lower()
        if lower_text in word_nodes:
            continue
        node_id = len(word_nodes)
        word_nodes[lower_text] = node_id
        color = POS_COLORS.get(token.pos_, '#FFFFFF')
        word_colors[lower_text] = color
        G.add_node(
            node_id,
            label=f"{token.text}\n[{POS_TRANSLATIONS.get(token.pos_, token.pos_)}]",
            pos=token.pos_,
            size=pos_counts[token.pos_] * 500,
            color=color,
        )

    # Pass 2: connect each word to its syntactic head.
    for token in doc:
        if token.pos_ == 'PUNCT':
            continue
        if token.dep_ == "ROOT" or token.head.pos_ == 'PUNCT':
            continue
        head_id = word_nodes.get(token.head.text.lower())
        if head_id is not None:
            G.add_edge(head_id, word_nodes[token.text.lower()], label=token.dep_)

    return G, word_colors
78
+
79
def visualize_syntax_graph(doc):
    """Draw the syntax graph of *doc* with matplotlib.

    Opens a new 20x15 figure, lays the graph out with a spring layout, draws
    nodes (colored and sized from their attributes), node labels and edge
    labels, and adds a legend listing each POS tag present in the graph with
    its translated name and token count.

    Args:
        doc: A parsed document accepted by ``create_syntax_graph``.

    Returns:
        The ``matplotlib.pyplot`` module, on which the new figure was drawn.
    """
    # The per-word color map returned alongside the graph is not needed here;
    # colors are read back from the node attributes instead.
    G, _ = create_syntax_graph(doc)

    plt.figure(figsize=(20, 15))
    layout = nx.spring_layout(G, k=2, iterations=100)

    node_colors = [data['color'] for _, data in G.nodes(data=True)]
    node_sizes = [data['size'] for _, data in G.nodes(data=True)]
    node_labels = {node: data['label'] for node, data in G.nodes(data=True)}

    nx.draw(G, layout, with_labels=False, node_color=node_colors,
            node_size=node_sizes, arrows=True)
    nx.draw_networkx_labels(G, layout, node_labels, font_size=8)

    edge_labels = nx.get_edge_attributes(G, 'label')
    nx.draw_networkx_edge_labels(G, layout, edge_labels=edge_labels, font_size=8)

    plt.title("Análisis Sintáctico")
    plt.axis('off')

    # Both lookups are invariant across legend entries, so compute them once
    # instead of re-scanning the document and the graph for every tag.
    pos_counts = count_pos(doc)
    tags_in_graph = set(nx.get_node_attributes(G, 'pos').values())
    legend_elements = [
        plt.Rectangle((0, 0), 1, 1, facecolor=color, edgecolor='none',
                      label=f"{POS_TRANSLATIONS[tag]} ({pos_counts[tag]})")
        for tag, color in POS_COLORS.items()
        if tag in tags_in_graph
    ]
    plt.legend(handles=legend_elements, loc='center left', bbox_to_anchor=(1, 0.5))

    return plt
103
+
104
def visualize_syntax(text, max_tokens=5000):
    """Parse *text* with the module-level ``nlp`` pipeline and plot its syntax graph.

    Args:
        text: The raw text to analyze.
        max_tokens: Maximum number of tokens to visualize. Longer documents
            are truncated to their first ``max_tokens`` tokens and a warning
            is printed.

    Returns:
        Whatever ``visualize_syntax_graph`` returns for the (possibly
        truncated) document.
    """
    doc = nlp(text)
    if len(doc) > max_tokens:
        # Truncate by tokens, matching both the length check and the warning.
        # Slicing the raw string would cut at a character offset instead and
        # would require running the whole pipeline a second time.
        doc = doc[:max_tokens].as_doc()
        print(f"Warning: The input text is too long. Only the first {max_tokens} tokens will be visualized.")
    return visualize_syntax_graph(doc)
111
+
112
+ # Repeated words colors
113
def get_repeated_words_colors(doc):
    """Map every repeated word form in *doc* to a part-of-speech color.

    A word form is the lower-cased token text. It counts as repeated when it
    occurs more than once among the non-punctuation tokens.

    Args:
        doc: An iterable of tokens exposing ``text`` and ``pos_``.

    Returns:
        A dict from repeated word form to its ``POS_COLORS`` entry (white,
        ``'#FFFFFF'``, for tags without one). When the occurrences of a form
        carry different tags, the last occurrence in *doc* decides the color.
    """
    frequency = Counter()
    for tok in doc:
        if tok.pos_ != 'PUNCT':
            frequency[tok.text.lower()] += 1
    repeated = {form for form, n in frequency.items() if n > 1}

    palette = {}
    for tok in doc:
        form = tok.text.lower()
        if form in repeated:
            # Later occurrences overwrite earlier ones on purpose.
            palette[form] = POS_COLORS.get(tok.pos_, '#FFFFFF')
    return palette
123
+
124
def highlight_repeated_words(doc, word_colors):
    """Render *doc* as an HTML string with repeated words highlighted.

    Tokens whose lower-cased text is a key of *word_colors* are wrapped in a
    ``<span>`` with that background color; all other tokens are emitted as
    plain text. Tokens are joined with single spaces.

    Token text is HTML-escaped (``&``, ``<``, ``>``) before insertion, so
    markup in the analyzed text is displayed literally and cannot inject tags
    into the page.

    Args:
        doc: An iterable of tokens exposing ``text``.
        word_colors: Mapping from lower-cased word form to a CSS color.

    Returns:
        The HTML fragment as a single string (empty for an empty *doc*).
    """
    # Local import keeps this block self-contained within the module.
    from html import escape

    pieces = []
    for token in doc:
        safe_text = escape(token.text, quote=False)
        color = word_colors.get(token.text.lower())
        if color is not None:
            pieces.append(f'<span style="background-color: {color};">{safe_text}</span>')
        else:
            pieces.append(safe_text)
    return ' '.join(pieces)