AIdeaText committed on
Commit
0e46985
1 Parent(s): 1d9ee0d

Update modules/syntax_analysis.py

Browse files
Files changed (1) hide show
  1. modules/syntax_analysis.py +63 -31
modules/syntax_analysis.py CHANGED
@@ -5,12 +5,7 @@ import networkx as nx
5
  import matplotlib.pyplot as plt
6
  from collections import Counter
7
 
8
- @st.cache_resource
9
- def load_spacy_model():
10
- return spacy.load("es_core_news_lg")
11
-
12
- # Load spaCy model
13
- nlp = spacy.load("es_core_news_lg")
14
 
15
  # Define colors for grammatical categories
16
  POS_COLORS = {
@@ -33,28 +28,66 @@ POS_COLORS = {
33
  }
34
 
35
  POS_TRANSLATIONS = {
36
- 'ADJ': 'Adjetivo',
37
- 'ADP': 'Advposición',
38
- 'ADV': 'Adverbio',
39
- 'AUX': 'Auxiliar',
40
- 'CCONJ': 'Conjunción Coordinante',
41
- 'DET': 'Determinante',
42
- 'INTJ': 'Interjección',
43
- 'NOUN': 'Sustantivo',
44
- 'NUM': 'Número',
45
- 'PART': 'Partícula',
46
- 'PRON': 'Pronombre',
47
- 'PROPN': 'Nombre Propio',
48
- 'SCONJ': 'Conjunción Subordinante',
49
- 'SYM': 'Símbolo',
50
- 'VERB': 'Verbo',
51
- 'X': 'Otro',
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
52
  }
53
 
54
  def count_pos(doc):
55
  return Counter(token.pos_ for token in doc if token.pos_ != 'PUNCT')
56
 
57
- def create_syntax_graph(doc):
58
  G = nx.DiGraph()
59
  pos_counts = count_pos(doc)
60
  word_nodes = {}
@@ -69,7 +102,7 @@ def create_syntax_graph(doc):
69
  color = POS_COLORS.get(token.pos_, '#FFFFFF')
70
  word_colors[lower_text] = color
71
  G.add_node(node_id,
72
- label=f"{token.text}\n[{POS_TRANSLATIONS.get(token.pos_, token.pos_)}]",
73
  pos=token.pos_,
74
  size=pos_counts[token.pos_] * 500,
75
  color=color)
@@ -81,8 +114,8 @@ def create_syntax_graph(doc):
81
 
82
  return G, word_colors
83
 
84
- def visualize_syntax_graph(doc):
85
- G, word_colors = create_syntax_graph(doc)
86
 
87
  plt.figure(figsize=(20, 15))
88
  pos = nx.spring_layout(G, k=2, iterations=100)
@@ -97,24 +130,23 @@ def visualize_syntax_graph(doc):
97
  edge_labels = nx.get_edge_attributes(G, 'label')
98
  nx.draw_networkx_edge_labels(G, pos, edge_labels=edge_labels, font_size=8)
99
 
100
- plt.title("Análisis Sintáctico")
101
  plt.axis('off')
102
 
103
- legend_elements = [plt.Rectangle((0,0),1,1, facecolor=color, edgecolor='none', label=f"{POS_TRANSLATIONS[pos]} ({count_pos(doc)[pos]})")
104
  for pos, color in POS_COLORS.items() if pos in set(nx.get_node_attributes(G, 'pos').values())]
105
  plt.legend(handles=legend_elements, loc='center left', bbox_to_anchor=(1, 0.5))
106
 
107
  return plt
108
 
109
- def visualize_syntax(text):
110
  max_tokens = 5000
111
  doc = nlp(text)
112
  if len(doc) > max_tokens:
113
  doc = nlp(text[:max_tokens])
114
  print(f"Warning: The input text is too long. Only the first {max_tokens} tokens will be visualized.")
115
- return visualize_syntax_graph(doc)
116
 
117
- # Repeated words colors
118
  def get_repeated_words_colors(doc):
119
  word_counts = Counter(token.text.lower() for token in doc if token.pos_ != 'PUNCT')
120
  repeated_words = {word: count for word, count in word_counts.items() if count > 1}
 
5
  import matplotlib.pyplot as plt
6
  from collections import Counter
7
 
8
+ # Remove the global nlp model loading
 
 
 
 
 
9
 
10
  # Define colors for grammatical categories
11
  POS_COLORS = {
 
28
  }
29
 
30
  POS_TRANSLATIONS = {
31
+ 'es': {
32
+ 'ADJ': 'Adjetivo',
33
+ 'ADP': 'Adposición',
34
+ 'ADV': 'Adverbio',
35
+ 'AUX': 'Auxiliar',
36
+ 'CCONJ': 'Conjunción Coordinante',
37
+ 'DET': 'Determinante',
38
+ 'INTJ': 'Interjección',
39
+ 'NOUN': 'Sustantivo',
40
+ 'NUM': 'Número',
41
+ 'PART': 'Partícula',
42
+ 'PRON': 'Pronombre',
43
+ 'PROPN': 'Nombre Propio',
44
+ 'SCONJ': 'Conjunción Subordinante',
45
+ 'SYM': 'Símbolo',
46
+ 'VERB': 'Verbo',
47
+ 'X': 'Otro',
48
+ },
49
+ 'en': {
50
+ 'ADJ': 'Adjective',
51
+ 'ADP': 'Adposition',
52
+ 'ADV': 'Adverb',
53
+ 'AUX': 'Auxiliary',
54
+ 'CCONJ': 'Coordinating Conjunction',
55
+ 'DET': 'Determiner',
56
+ 'INTJ': 'Interjection',
57
+ 'NOUN': 'Noun',
58
+ 'NUM': 'Number',
59
+ 'PART': 'Particle',
60
+ 'PRON': 'Pronoun',
61
+ 'PROPN': 'Proper Noun',
62
+ 'SCONJ': 'Subordinating Conjunction',
63
+ 'SYM': 'Symbol',
64
+ 'VERB': 'Verb',
65
+ 'X': 'Other',
66
+ },
67
+ 'fr': {
68
+ 'ADJ': 'Adjectif',
69
+ 'ADP': 'Adposition',
70
+ 'ADV': 'Adverbe',
71
+ 'AUX': 'Auxiliaire',
72
+ 'CCONJ': 'Conjonction de Coordination',
73
+ 'DET': 'Déterminant',
74
+ 'INTJ': 'Interjection',
75
+ 'NOUN': 'Nom',
76
+ 'NUM': 'Nombre',
77
+ 'PART': 'Particule',
78
+ 'PRON': 'Pronom',
79
+ 'PROPN': 'Nom Propre',
80
+ 'SCONJ': 'Conjonction de Subordination',
81
+ 'SYM': 'Symbole',
82
+ 'VERB': 'Verbe',
83
+ 'X': 'Autre',
84
+ }
85
  }
86
 
87
  def count_pos(doc):
88
  return Counter(token.pos_ for token in doc if token.pos_ != 'PUNCT')
89
 
90
+ def create_syntax_graph(doc, lang):
91
  G = nx.DiGraph()
92
  pos_counts = count_pos(doc)
93
  word_nodes = {}
 
102
  color = POS_COLORS.get(token.pos_, '#FFFFFF')
103
  word_colors[lower_text] = color
104
  G.add_node(node_id,
105
+ label=f"{token.text}\n[{POS_TRANSLATIONS[lang].get(token.pos_, token.pos_)}]",
106
  pos=token.pos_,
107
  size=pos_counts[token.pos_] * 500,
108
  color=color)
 
114
 
115
  return G, word_colors
116
 
117
+ def visualize_syntax_graph(doc, lang):
118
+ G, word_colors = create_syntax_graph(doc, lang)
119
 
120
  plt.figure(figsize=(20, 15))
121
  pos = nx.spring_layout(G, k=2, iterations=100)
 
130
  edge_labels = nx.get_edge_attributes(G, 'label')
131
  nx.draw_networkx_edge_labels(G, pos, edge_labels=edge_labels, font_size=8)
132
 
133
+ plt.title("Syntactic Analysis" if lang == 'en' else "Analyse Syntaxique" if lang == 'fr' else "Análisis Sintáctico")
134
  plt.axis('off')
135
 
136
+ legend_elements = [plt.Rectangle((0,0),1,1, facecolor=color, edgecolor='none', label=f"{POS_TRANSLATIONS[lang][pos]} ({count_pos(doc)[pos]})")
137
  for pos, color in POS_COLORS.items() if pos in set(nx.get_node_attributes(G, 'pos').values())]
138
  plt.legend(handles=legend_elements, loc='center left', bbox_to_anchor=(1, 0.5))
139
 
140
  return plt
141
 
142
+ def visualize_syntax(text, nlp, lang):
143
  max_tokens = 5000
144
  doc = nlp(text)
145
  if len(doc) > max_tokens:
146
  doc = nlp(text[:max_tokens])
147
  print(f"Warning: The input text is too long. Only the first {max_tokens} tokens will be visualized.")
148
+ return visualize_syntax_graph(doc, lang)
149
 
 
150
  def get_repeated_words_colors(doc):
151
  word_counts = Counter(token.text.lower() for token in doc if token.pos_ != 'PUNCT')
152
  repeated_words = {word: count for word, count in word_counts.items() if count > 1}