This repository was archived by the owner on Mar 8, 2020. It is now read-only.
Description Hi,
I found that python-driver lacks position information for several types of tokens.
import bblfsh

source_path = "location/of/file.py"
server = bblfsh.BblfshClient("0.0.0.0:9432")

# Keep the raw text so each reported position can be checked against it.
with open(source_path, "r") as handle:
    content = handle.read()

# Parse the same file into a UAST.
tree = server.parse(source_path).uast

# Walk the tree in pre-order, keep only nodes that carry a token, and
# order them by their start offset.
token_nodes = sorted(
    (node for node in bblfsh.iterator(tree, bblfsh.TreeOrder.PRE_ORDER)
     if node.token),
    key=lambda node: node.start_position.offset,
)

# One line per token, "|"-separated:
#   start offset | token | source slice of the token's length taken at the
#   start offset | source slice from the start offset through the end offset
# (the end offset is used as inclusive here, hence the "+ 1").
for node in token_nodes:
    begin = node.start_position.offset
    last = node.end_position.offset
    print(
        begin,
        node.token,
        content[begin:begin + len(node.token)],
        content[begin:last + 1],
        sep="|",
    )
The source code I used is in the "Details" section below.
Details
import argparse
import os
import tempfile
import unittest
import sourced .ml .tests .models as paths
from sourced .ml .models import Topics
from sourced .ml .cmd import bigartm2asdf
class TopicsTests (unittest .TestCase ):
def setUp (self ):
self .model = Topics ().load (source = paths .TOPICS )
def test_dump (self ):
res = self .model .dump ()
self .assertEqual (res , """320 topics, 1000 tokens
First 10 tokens: ['ulcancel', 'domainlin', 'trudi', 'fncreateinstancedbaselin', 'wbnz', 'lmultiplicand', 'otronumero', 'qxln', 'gvgq', 'polaroidish']
Topics: unlabeled
non-zero elements: 6211 (0.019409)""" ) # noqa
def test_props (self ):
self .assertEqual (len (self .model ), 320 )
self .assertEqual (len (self .model .tokens ), 1000 )
self .assertIsNone (self .model .topics )
zt = self .model [0 ]
self .assertEqual (len (zt ), 8 )
self .assertEqual (zt [0 ][0 ], "olcustom" )
self .assertAlmostEqual (zt [0 ][1 ], 1.23752e-06 , 6 )
def test_label (self ):
with self .assertRaises (ValueError ):
self .model .label_topics ([1 , 2 , 3 ])
with self .assertRaises (TypeError ):
self .model .label_topics (list (range (320 )))
self .model .label_topics ([str (i ) for i in range (320 )])
self .assertEqual (self .model .topics [0 ], "0" )
def test_save (self ):
with tempfile .NamedTemporaryFile (prefix = "sourced.ml-topics-test-" ) as f :
self .model .save (f .name )
new = Topics ().load (f .name )
self .assertEqual (self .model .tokens , new .tokens )
self .assertEqual ((self .model .matrix != new .matrix ).getnnz (), 0 )
def test_bigartm2asdf (self ):
with tempfile .NamedTemporaryFile (prefix = "sourced.ml-topics-test-" ) as f :
args = argparse .Namespace (
input = os .path .join (os .path .dirname (__file__ ), paths .TOPICS_SRC ),
output = f .name )
bigartm2asdf (args )
model = Topics ().load (f .name )
self .assertEqual (len (model ), 320 )
self .assertEqual (len (model .tokens ), 1000 )
if __name__ == "__main__" :
unittest .main ()
As a result, we can see several tokens without position information:
0|argparse|import a|i
0|os|im|i
0|tempfile|import a|i
0|unittest|import a|i
0|sourced.ml.tests.models|import argparse
import |i
0|paths|impor|i
0|sourced.ml.models|import argparse
i|i
0|Topics|import|i
0|sourced.ml.cmd|import argpars|i
0|bigartm2asdf|import argpa|i
0|source|import|i
0|!=|im|i
0|prefix|import|i
0|input|impor|i
0|output|import|i
0|prefix|import|i
0|==|im|i
184|TopicsTests|TopicsTests|TopicsTests
Some of them are imports, like:
0|argparse|import a|i
0|os|im|i
Some are operators, like:
0|!=|im|i
0|==|im|i
Some are keyword arguments, like:
0|source|import|i
0|prefix|import|i
0|input|impor|i
0|output|import|i
0|prefix|import|i
Reactions are currently unavailable
Hi,
I found that python-driver lacks position information for several types of tokens.
The source code I used is in the "Details" section below.
Details
As a result, we can see several tokens without position information:
some of them are imports like
some operators
some arguments