uast.go 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511
  1. package hercules
  2. import (
  3. "bytes"
  4. "context"
  5. "errors"
  6. "fmt"
  7. "io"
  8. goioutil "io/ioutil"
  9. "os"
  10. "path"
  11. "runtime"
  12. "strings"
  13. "sync"
  14. "time"
  15. "github.com/gogo/protobuf/proto"
  16. "github.com/jeffail/tunny"
  17. "gopkg.in/bblfsh/client-go.v2"
  18. "gopkg.in/bblfsh/sdk.v1/protocol"
  19. "gopkg.in/bblfsh/sdk.v1/uast"
  20. "gopkg.in/src-d/enry.v1"
  21. "gopkg.in/src-d/go-git.v4"
  22. "gopkg.in/src-d/go-git.v4/plumbing"
  23. "gopkg.in/src-d/go-git.v4/plumbing/object"
  24. "gopkg.in/src-d/go-git.v4/utils/ioutil"
  25. "gopkg.in/src-d/go-git.v4/utils/merkletrie"
  26. "gopkg.in/src-d/hercules.v3/pb"
  27. )
  28. type UASTExtractor struct {
  29. Endpoint string
  30. Context func() (context.Context, context.CancelFunc)
  31. PoolSize int
  32. Languages map[string]bool
  33. FailOnErrors bool
  34. ProcessedFiles map[string]int
  35. clients []*bblfsh.Client
  36. pool *tunny.WorkPool
  37. }
  38. const (
  39. UAST_EXTRACTION_SKIPPED = -(1 << 31)
  40. ConfigUASTEndpoint = "ConfigUASTEndpoint"
  41. ConfigUASTTimeout = "ConfigUASTTimeout"
  42. ConfigUASTPoolSize = "ConfigUASTPoolSize"
  43. ConfigUASTFailOnErrors = "ConfigUASTFailOnErrors"
  44. ConfigUASTLanguages = "ConfigUASTLanguages"
  45. FeatureUast = "uast"
  46. DependencyUasts = "uasts"
  47. )
  48. type uastTask struct {
  49. Client *bblfsh.Client
  50. Lock *sync.RWMutex
  51. Dest map[plumbing.Hash]*uast.Node
  52. File *object.File
  53. Errors *[]error
  54. Status chan int
  55. }
  56. type worker struct {
  57. Client *bblfsh.Client
  58. Job func(interface{}) interface{}
  59. }
  60. func (w worker) TunnyReady() bool {
  61. return true
  62. }
  63. func (w worker) TunnyJob(data interface{}) interface{} {
  64. task := data.(uastTask)
  65. task.Client = w.Client
  66. return w.Job(task)
  67. }
  68. func (exr *UASTExtractor) Name() string {
  69. return "UAST"
  70. }
  71. func (exr *UASTExtractor) Provides() []string {
  72. arr := [...]string{DependencyUasts}
  73. return arr[:]
  74. }
  75. func (exr *UASTExtractor) Requires() []string {
  76. arr := [...]string{DependencyTreeChanges, DependencyBlobCache}
  77. return arr[:]
  78. }
  79. func (exr *UASTExtractor) Features() []string {
  80. arr := [...]string{FeatureUast}
  81. return arr[:]
  82. }
  83. func (exr *UASTExtractor) ListConfigurationOptions() []ConfigurationOption {
  84. options := [...]ConfigurationOption{{
  85. Name: ConfigUASTEndpoint,
  86. Description: "How many days there are in a single band.",
  87. Flag: "bblfsh",
  88. Type: StringConfigurationOption,
  89. Default: "0.0.0.0:9432"}, {
  90. Name: ConfigUASTTimeout,
  91. Description: "Babelfish's server timeout in seconds.",
  92. Flag: "bblfsh-timeout",
  93. Type: IntConfigurationOption,
  94. Default: 20}, {
  95. Name: ConfigUASTPoolSize,
  96. Description: "Number of goroutines to extract UASTs.",
  97. Flag: "bblfsh-pool-size",
  98. Type: IntConfigurationOption,
  99. Default: runtime.NumCPU()}, {
  100. Name: ConfigUASTFailOnErrors,
  101. Description: "Panic if there is a UAST extraction error.",
  102. Flag: "bblfsh-fail-on-error",
  103. Type: BoolConfigurationOption,
  104. Default: false}, {
  105. Name: ConfigUASTLanguages,
  106. Description: "Programming languages from which to extract UASTs. Separated by comma \",\".",
  107. Flag: "languages",
  108. Type: StringConfigurationOption,
  109. Default: "Python,Java"},
  110. }
  111. return options[:]
  112. }
  113. func (exr *UASTExtractor) Configure(facts map[string]interface{}) {
  114. if val, exists := facts[ConfigUASTEndpoint].(string); exists {
  115. exr.Endpoint = val
  116. }
  117. if val, exists := facts[ConfigUASTTimeout].(int); exists {
  118. exr.Context = func() (context.Context, context.CancelFunc) {
  119. return context.WithTimeout(context.Background(),
  120. time.Duration(val)*time.Second)
  121. }
  122. }
  123. if val, exists := facts[ConfigUASTPoolSize].(int); exists {
  124. exr.PoolSize = val
  125. }
  126. if val, exists := facts[ConfigUASTLanguages].(string); exists {
  127. exr.Languages = map[string]bool{}
  128. for _, lang := range strings.Split(val, ",") {
  129. exr.Languages[strings.TrimSpace(lang)] = true
  130. }
  131. }
  132. if val, exists := facts[ConfigUASTFailOnErrors].(bool); exists {
  133. exr.FailOnErrors = val
  134. }
  135. }
  136. func (exr *UASTExtractor) Initialize(repository *git.Repository) {
  137. if exr.Context == nil {
  138. exr.Context = func() (context.Context, context.CancelFunc) {
  139. return context.Background(), nil
  140. }
  141. }
  142. poolSize := exr.PoolSize
  143. if poolSize == 0 {
  144. poolSize = runtime.NumCPU()
  145. }
  146. var err error
  147. exr.clients = make([]*bblfsh.Client, poolSize)
  148. for i := 0; i < poolSize; i++ {
  149. client, err := bblfsh.NewClient(exr.Endpoint)
  150. if err != nil {
  151. panic(err)
  152. }
  153. exr.clients[i] = client
  154. }
  155. if exr.pool != nil {
  156. exr.pool.Close()
  157. }
  158. workers := make([]tunny.TunnyWorker, poolSize)
  159. for i := 0; i < poolSize; i++ {
  160. workers[i] = worker{Client: exr.clients[i], Job: exr.extractTask}
  161. }
  162. exr.pool, err = tunny.CreateCustomPool(workers).Open()
  163. if err != nil {
  164. panic(err)
  165. }
  166. exr.ProcessedFiles = map[string]int{}
  167. if exr.Languages == nil {
  168. exr.Languages = map[string]bool{}
  169. }
  170. }
  171. func (exr *UASTExtractor) Consume(deps map[string]interface{}) (map[string]interface{}, error) {
  172. cache := deps[DependencyBlobCache].(map[plumbing.Hash]*object.Blob)
  173. treeDiffs := deps[DependencyTreeChanges].(object.Changes)
  174. uasts := map[plumbing.Hash]*uast.Node{}
  175. lock := sync.RWMutex{}
  176. errs := make([]error, 0)
  177. status := make(chan int)
  178. pending := 0
  179. submit := func(change *object.Change) {
  180. {
  181. reader, err := cache[change.To.TreeEntry.Hash].Reader()
  182. if err != nil {
  183. errs = append(errs, err)
  184. return
  185. }
  186. defer ioutil.CheckClose(reader, &err)
  187. buf := new(bytes.Buffer)
  188. if _, err := buf.ReadFrom(reader); err != nil {
  189. errs = append(errs, err)
  190. return
  191. }
  192. lang := enry.GetLanguage(change.To.Name, buf.Bytes())
  193. if _, exists := exr.Languages[lang]; !exists {
  194. exr.ProcessedFiles[change.To.Name] = UAST_EXTRACTION_SKIPPED
  195. return
  196. }
  197. exr.ProcessedFiles[change.To.Name]++
  198. }
  199. pending++
  200. exr.pool.SendWorkAsync(uastTask{
  201. Lock: &lock,
  202. Dest: uasts,
  203. File: &object.File{Name: change.To.Name, Blob: *cache[change.To.TreeEntry.Hash]},
  204. Errors: &errs, Status: status}, nil)
  205. }
  206. for _, change := range treeDiffs {
  207. action, err := change.Action()
  208. if err != nil {
  209. return nil, err
  210. }
  211. switch action {
  212. case merkletrie.Insert:
  213. submit(change)
  214. case merkletrie.Delete:
  215. continue
  216. case merkletrie.Modify:
  217. submit(change)
  218. }
  219. }
  220. for i := 0; i < pending; i++ {
  221. _ = <-status
  222. }
  223. if len(errs) > 0 {
  224. msgs := make([]string, len(errs))
  225. for i, err := range errs {
  226. msgs[i] = err.Error()
  227. }
  228. joined := strings.Join(msgs, "\n")
  229. if exr.FailOnErrors {
  230. return nil, errors.New(joined)
  231. } else {
  232. fmt.Fprintln(os.Stderr, joined)
  233. }
  234. }
  235. return map[string]interface{}{DependencyUasts: uasts}, nil
  236. }
  237. func (exr *UASTExtractor) extractUAST(
  238. client *bblfsh.Client, file *object.File) (*uast.Node, error) {
  239. request := client.NewParseRequest()
  240. contents, err := file.Contents()
  241. if err != nil {
  242. return nil, err
  243. }
  244. request.Content(contents)
  245. request.Filename(file.Name)
  246. ctx, cancel := exr.Context()
  247. if cancel != nil {
  248. defer cancel()
  249. }
  250. response, err := request.DoWithContext(ctx)
  251. if err != nil {
  252. if strings.Contains("missing driver", err.Error()) {
  253. return nil, nil
  254. }
  255. return nil, err
  256. }
  257. if response.Status != protocol.Ok {
  258. return nil, errors.New(strings.Join(response.Errors, "\n"))
  259. }
  260. if err != nil {
  261. return nil, err
  262. }
  263. return response.UAST, nil
  264. }
  265. func (exr *UASTExtractor) extractTask(data interface{}) interface{} {
  266. task := data.(uastTask)
  267. defer func() { task.Status <- 0 }()
  268. node, err := exr.extractUAST(task.Client, task.File)
  269. task.Lock.Lock()
  270. defer task.Lock.Unlock()
  271. if err != nil {
  272. *task.Errors = append(*task.Errors, errors.New(task.File.Name+": "+err.Error()))
  273. return nil
  274. }
  275. if node != nil {
  276. task.Dest[task.File.Hash] = node
  277. }
  278. return nil
  279. }
  280. type UASTChange struct {
  281. Before *uast.Node
  282. After *uast.Node
  283. Change *object.Change
  284. }
  285. const (
  286. DependencyUastChanges = "changed_uasts"
  287. )
  288. type UASTChanges struct {
  289. cache map[plumbing.Hash]*uast.Node
  290. }
  291. func (uc *UASTChanges) Name() string {
  292. return "UASTChanges"
  293. }
  294. func (uc *UASTChanges) Provides() []string {
  295. arr := [...]string{DependencyUastChanges}
  296. return arr[:]
  297. }
  298. func (uc *UASTChanges) Requires() []string {
  299. arr := [...]string{DependencyUasts, DependencyTreeChanges}
  300. return arr[:]
  301. }
  302. func (uc *UASTChanges) Features() []string {
  303. arr := [...]string{FeatureUast}
  304. return arr[:]
  305. }
  306. func (uc *UASTChanges) ListConfigurationOptions() []ConfigurationOption {
  307. return []ConfigurationOption{}
  308. }
  309. func (uc *UASTChanges) Configure(facts map[string]interface{}) {}
  310. func (uc *UASTChanges) Initialize(repository *git.Repository) {
  311. uc.cache = map[plumbing.Hash]*uast.Node{}
  312. }
  313. func (uc *UASTChanges) Consume(deps map[string]interface{}) (map[string]interface{}, error) {
  314. uasts := deps[DependencyUasts].(map[plumbing.Hash]*uast.Node)
  315. treeDiffs := deps[DependencyTreeChanges].(object.Changes)
  316. commit := make([]UASTChange, 0, len(treeDiffs))
  317. for _, change := range treeDiffs {
  318. action, err := change.Action()
  319. if err != nil {
  320. return nil, err
  321. }
  322. switch action {
  323. case merkletrie.Insert:
  324. hashTo := change.To.TreeEntry.Hash
  325. uastTo := uasts[hashTo]
  326. commit = append(commit, UASTChange{Before: nil, After: uastTo, Change: change})
  327. uc.cache[hashTo] = uastTo
  328. case merkletrie.Delete:
  329. hashFrom := change.From.TreeEntry.Hash
  330. commit = append(commit, UASTChange{Before: uc.cache[hashFrom], After: nil, Change: change})
  331. delete(uc.cache, hashFrom)
  332. case merkletrie.Modify:
  333. hashFrom := change.From.TreeEntry.Hash
  334. hashTo := change.To.TreeEntry.Hash
  335. uastTo := uasts[hashTo]
  336. commit = append(commit, UASTChange{Before: uc.cache[hashFrom], After: uastTo, Change: change})
  337. delete(uc.cache, hashFrom)
  338. uc.cache[hashTo] = uastTo
  339. }
  340. }
  341. return map[string]interface{}{DependencyUastChanges: commit}, nil
  342. }
  343. type UASTChangesSaver struct {
  344. // OutputPath points to the target directory with UASTs
  345. OutputPath string
  346. repository *git.Repository
  347. result [][]UASTChange
  348. }
  349. const (
  350. ConfigUASTChangesSaverOutputPath = "UASTChangesSaver.OutputPath"
  351. )
  352. func (saver *UASTChangesSaver) Name() string {
  353. return "UASTChangesSaver"
  354. }
  355. func (saver *UASTChangesSaver) Provides() []string {
  356. return []string{}
  357. }
  358. func (saver *UASTChangesSaver) Requires() []string {
  359. arr := [...]string{DependencyUastChanges}
  360. return arr[:]
  361. }
  362. func (saver *UASTChangesSaver) Features() []string {
  363. arr := [...]string{FeatureUast}
  364. return arr[:]
  365. }
  366. func (saver *UASTChangesSaver) ListConfigurationOptions() []ConfigurationOption {
  367. options := [...]ConfigurationOption{{
  368. Name: ConfigUASTChangesSaverOutputPath,
  369. Description: "The target directory where to store the changed UAST files.",
  370. Flag: "changed-uast-dir",
  371. Type: StringConfigurationOption,
  372. Default: "."},
  373. }
  374. return options[:]
  375. }
  376. func (saver *UASTChangesSaver) Flag() string {
  377. return "dump-uast-changes"
  378. }
  379. func (saver *UASTChangesSaver) Configure(facts map[string]interface{}) {
  380. if val, exists := facts[ConfigUASTChangesSaverOutputPath]; exists {
  381. saver.OutputPath = val.(string)
  382. }
  383. }
  384. func (saver *UASTChangesSaver) Initialize(repository *git.Repository) {
  385. saver.repository = repository
  386. saver.result = [][]UASTChange{}
  387. }
  388. func (saver *UASTChangesSaver) Consume(deps map[string]interface{}) (map[string]interface{}, error) {
  389. changes := deps[DependencyUastChanges].([]UASTChange)
  390. saver.result = append(saver.result, changes)
  391. return nil, nil
  392. }
  393. func (saver *UASTChangesSaver) Finalize() interface{} {
  394. return saver.result
  395. }
  396. func (saver *UASTChangesSaver) Serialize(result interface{}, binary bool, writer io.Writer) error {
  397. saverResult := result.([][]UASTChange)
  398. fileNames := saver.dumpFiles(saverResult)
  399. if binary {
  400. return saver.serializeBinary(fileNames, writer)
  401. }
  402. saver.serializeText(fileNames, writer)
  403. return nil
  404. }
  405. func (saver *UASTChangesSaver) dumpFiles(result [][]UASTChange) []*pb.UASTChange {
  406. fileNames := []*pb.UASTChange{}
  407. for i, changes := range result {
  408. for j, change := range changes {
  409. if change.Before == nil || change.After == nil {
  410. continue
  411. }
  412. record := &pb.UASTChange{FileName: change.Change.To.Name}
  413. bs, _ := change.Before.Marshal()
  414. record.UastBefore = path.Join(saver.OutputPath, fmt.Sprintf(
  415. "%d_%d_before_%s.pb", i, j, change.Change.From.TreeEntry.Hash.String()))
  416. goioutil.WriteFile(record.UastBefore, bs, 0666)
  417. blob, _ := saver.repository.BlobObject(change.Change.From.TreeEntry.Hash)
  418. s, _ := (&object.File{Blob: *blob}).Contents()
  419. record.SrcBefore = path.Join(saver.OutputPath, fmt.Sprintf(
  420. "%d_%d_before_%s.src", i, j, change.Change.From.TreeEntry.Hash.String()))
  421. goioutil.WriteFile(record.SrcBefore, []byte(s), 0666)
  422. bs, _ = change.After.Marshal()
  423. record.UastAfter = path.Join(saver.OutputPath, fmt.Sprintf(
  424. "%d_%d_after_%s.pb", i, j, change.Change.To.TreeEntry.Hash.String()))
  425. goioutil.WriteFile(record.UastAfter, bs, 0666)
  426. blob, _ = saver.repository.BlobObject(change.Change.To.TreeEntry.Hash)
  427. s, _ = (&object.File{Blob: *blob}).Contents()
  428. record.SrcAfter = path.Join(saver.OutputPath, fmt.Sprintf(
  429. "%d_%d_after_%s.src", i, j, change.Change.To.TreeEntry.Hash.String()))
  430. goioutil.WriteFile(record.SrcAfter, []byte(s), 0666)
  431. fileNames = append(fileNames, record)
  432. }
  433. }
  434. return fileNames
  435. }
  436. func (saver *UASTChangesSaver) serializeText(result []*pb.UASTChange, writer io.Writer) {
  437. for _, sc := range result {
  438. kv := [...]string{
  439. "file: " + sc.FileName,
  440. "src0: " + sc.SrcBefore, "src1: " + sc.SrcAfter,
  441. "uast0: " + sc.UastBefore, "uast1: " + sc.UastAfter,
  442. }
  443. fmt.Fprintf(writer, " - {%s}\n", strings.Join(kv[:], ", "))
  444. }
  445. }
  446. func (saver *UASTChangesSaver) serializeBinary(result []*pb.UASTChange, writer io.Writer) error {
  447. message := pb.UASTChangesSaverResults{Changes: result}
  448. serialized, err := proto.Marshal(&message)
  449. if err != nil {
  450. return err
  451. }
  452. writer.Write(serialized)
  453. return nil
  454. }
  455. func init() {
  456. Registry.Register(&UASTExtractor{})
  457. Registry.Register(&UASTChanges{})
  458. Registry.Register(&UASTChangesSaver{})
  459. }